2020-04-19 19:57:13 +02:00
#! /usr/bin/env python3
2020-04-24 03:56:39 +02:00
2020-04-19 19:57:13 +02:00
"""
2020-04-26 00:56:24 +02:00
mkv-this: input text and/or url, output markovified text.
Copyright (C) 2020 martianhiatus@riseup.net.
2020-04-19 19:57:13 +02:00
This program is free software : you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation , either version 3 of the License , or
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
2020-04-26 01:57:16 +02:00
2020-04-26 00:56:24 +02:00
2020-04-26 00:56:24 +02:00
import os
2020-04-24 03:06:50 +02:00
import sys
import argparse
2020-04-26 01:59:52 +02:00
from . functions import URL , convert_html , read , mkbtext , mkbnewline , writesentence , writeshortsentence
2020-04-24 03:06:50 +02:00
2020-04-24 17:32:27 +02:00
# argparse
def parse_the_args(argv=None):
    """Build the mkv-this command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings. When None (the default)
            argparse reads the process command line, exactly as the
            original zero-argument call did.

    Returns:
        argparse.Namespace with the attributes infile, outfile, state_size,
        sentences, length, overlap, combine, combine_URL, weight, URL,
        well_formed and newline.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description=" markovify local text files or URLs and output the results to a local text file. ",
        epilog=" may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. ",
    )

    # positional args:
    parser.add_argument(
        "infile",
        help=" the text file to process. NB: file cannot be empty. ",
    )
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help=" the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt. ",
    )

    # optional args:
    parser.add_argument(
        "-s", "--state-size",
        help=" the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect. ",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n", "--sentences",
        help=" the number of ' sentences ' to output. defaults to 5. NB: if your text has no initial caps, a ' sentence ' will be a paragraph. ",
        type=int,
        default=5,
    )
    # no default: length stays None unless given, and main() branches on it.
    parser.add_argument(
        "-l", "--length",
        help=" set maximum number of characters per sentence. ",
        type=int,
    )
    parser.add_argument(
        "-o", "--overlap",
        help=" the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5 ",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "-c", "--combine",
        help=" provide an another text file to be combined with the first item. ",
    )
    parser.add_argument(
        "-C", "--combine-URL",
        help=" provide a URL to be combined with the first item ",
    )
    parser.add_argument(
        "-w", "--weight",
        help=" specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less. ",
        type=float,
        default=1,
    )

    # switches
    # store_true = default to False, become True if flagged.
    parser.add_argument(
        "-u", "--URL",
        help=" infile is a URL instead. ",
        action="store_true",
    )
    # The double quotes in this help text are escaped on purpose: written as
    # two adjacent literals they were silently concatenated away.
    parser.add_argument(
        "-f", "--well-formed",
        help=" enforce ' well_formed ' : discard sentences containing [] {} () \" \" ' ' from the markov model. use if output is filthy. ",
        action="store_true",
    )
    parser.add_argument(
        "--newline",
        help=(
            " sentences in input file end with newlines "
            "rather than full stops . "
        ),
        action="store_true",
    )

    return parser.parse_args(argv)
2020-04-24 18:37:16 +02:00
2020-04-26 00:56:24 +02:00
# make args avail:
2020-04-24 17:32:27 +02:00
# NOTE: this runs at import time, so merely importing the module parses the
# command line; main() reads this module-level namespace.
args = parse_the_args ( )
2020-04-23 20:53:52 +02:00
2020-04-24 17:32:27 +02:00
def main():
    """Run mkv-this using the module-level ``args`` namespace.

    Steps:
      1. Load the primary text from ``args.infile`` (fetched and converted
         from HTML when ``args.URL`` is set, otherwise read as a local file).
      2. If ``-c`` or ``-C`` was given, load the second text (``-c`` takes
         precedence when both are set), build one model per text and merge
         them with ``markovify.combine`` using weights ``[1, args.weight]``.
         Otherwise build a single model from the primary text.
      3. Write ``args.sentences`` sentences to ``args.outfile`` with
         ``writeshortsentence`` when ``args.length`` is set, else
         ``writesentence``.
      4. Print the parsed options and whether the output file exists, then
         exit via ``sys.exit()``.
    """
    # primary text: either a URL or a local file
    if args.URL:
        text = convert_html(URL(args.infile))
    else:
        text = read(args.infile)

    # --newline selects the newline-delimited model builder
    build_model = mkbnewline if args.newline else mkbtext

    if args.combine or args.combine_URL:
        # markovify is not imported at file level, so the bare name used to
        # raise NameError here; import it where it is needed.
        import markovify

        # second text: a local file (-c) wins over a URL (-C)
        if args.combine:
            ctext = read(args.combine)
        else:
            ctext = convert_html(URL(args.combine_URL))

        # build the models + a combined model:
        text_model = build_model(text, args.state_size, args.well_formed)
        ctext_model = build_model(ctext, args.state_size, args.well_formed)
        model = markovify.combine(
            [text_model, ctext_model], [1, args.weight])
    else:
        model = build_model(text, args.state_size, args.well_formed)

    # write it! both writers receive the same five arguments
    write = writeshortsentence if args.length else writesentence
    write(model, args.sentences, args.outfile, args.overlap, args.length)

    # report the options used and the outcome
    print(' \n : : \n ')
    for key, value in vars(args).items():
        print(' : ' + key.ljust(15, ' ') + ' : ' + str(value).ljust(10))
    if os.path.isfile(args.outfile):
        print(" \n : literary genius has been written to the file "
              + args.outfile + " . thanks for playing! \n \n : ' Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected... ' ")
    else:
        print(' : mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz! ')
    sys.exit()
2020-04-19 19:57:13 +02:00
2020-04-23 20:53:52 +02:00
2020-04-24 17:32:27 +02:00
# Script entry point. The comparison literal must be exactly '__main__'
# (no surrounding spaces), otherwise the guard never matches and main()
# is never called.
if __name__ == '__main__':
    main()