2020-04-19 19:57:13 +02:00
#! /usr/bin/env python3
2020-04-24 03:56:39 +02:00
2020-04-19 19:57:13 +02:00
"""
mkv-this: input text, output markovified text.
Copyright (C) 2020 mousebot@riseup.net.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
"""
2020-04-23 20:53:52 +02:00
a (very basic) script to markovify local and/or remote text files and
output a user-specified number of sentences to a local text file.
see --help for other options.
2020-04-19 19:57:13 +02:00
"""
2020-04-23 02:05:16 +02:00
2020-04-24 03:06:50 +02:00
import os
2020-04-25 02:44:51 +02:00
import re
2020-04-24 03:06:50 +02:00
import requests
import markovify
import sys
import argparse
2020-04-24 23:11:34 +02:00
import html2text
2020-04-24 03:06:50 +02:00
2020-04-24 17:32:27 +02:00
# argparse
def parse_the_args():
    """Build the mkv-this command-line parser and parse ``sys.argv``.

    Positional arguments:
        infile   -- text file (or URL, with -u) to process.
        outfile  -- file to append output to (default ./mkv-output.txt).

    Options: -s/--state-size, -n/--sentences, -l/--length, -o/--overlap,
    -c/--combine, -C/--combine-URL, -w/--weight, and the switches
    -u/--URL, -f/--well-formed and --newline.

    Returns:
        argparse.Namespace with the attributes ``infile``, ``outfile``,
        ``state_size``, ``sentences``, ``length``, ``overlap``, ``combine``,
        ``combine_URL``, ``weight``, ``URL``, ``well_formed``, ``newline``.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description="markovify one or two local or remote text files and "
                    "output the results to a local text file.",
        epilog="may you find many prophetic énoncés in your virtual bird "
               "guts! Here, this is not at all the becomings that are "
               "connected... so if you want to edit it like a bot yourself, "
               "it is trivial.")

    # positional args:
    parser.add_argument(
        'infile',
        help="the text file to process. NB: file cannot be empty.")
    parser.add_argument(
        'outfile', nargs='?', default="./mkv-output.txt",
        help="the file to save to. if the file is used more than once, "
             "subsequent literature will be appended to it. defaults to "
             "./mkv-output.txt.")

    # optional args:
    parser.add_argument(
        '-s', '--state-size', type=int, default=2,
        help="the number of preceding words used to calculate the "
             "probability of the next word. defaults to 2, 1 makes it more "
             "random, 3 less so. > 4 will likely have little effect.")
    parser.add_argument(
        '-n', '--sentences', type=int, default=5,
        help="the number of 'sentences' to output. defaults to 5.")
    parser.add_argument(
        '-l', '--length', type=int,
        help="set maximum number of characters per sentence.")
    parser.add_argument(
        '-o', '--overlap', type=float, default=0.5,
        help="the amount of overlap allowed between original and output, "
             "expressed as a ratio between 0 and 1. defaults to 0.5")
    parser.add_argument(
        '-c', '--combine',
        help="provide another text file to be combined with the first item.")
    parser.add_argument(
        '-C', '--combine-URL',
        help="provide a URL to be combined with the first item")
    parser.add_argument(
        '-w', '--weight', type=float, default=1,
        help="specify the weight to be given to the text provided with -c "
             "or -C. defaults to 1, and the weight of the initial text is 1. "
             "1.5 will place more weight on the second text, 0.5 will place "
             "less.")

    # switches (store_true = default to False, become True if flagged):
    parser.add_argument(
        '-u', '--URL', action='store_true',
        help="infile is a URL instead.")
    # The double quotes are escaped on purpose: written bare they end the
    # literal and silently vanish from the help output.
    parser.add_argument(
        '-f', '--well-formed', action='store_true',
        help="enforce 'well_formed': discard sentences containing "
             "[]{}()\"\"'' from the markov model. use if output is filthy.")
    parser.add_argument(
        '--newline', action='store_true',
        help="sentences in input file end with newlines rather than full "
             "stops.")

    return parser.parse_args()
2020-04-24 18:37:16 +02:00
2020-04-24 23:11:34 +02:00
# fetch/read/build/write fns:
2020-04-24 18:37:16 +02:00
2020-04-24 17:32:27 +02:00
def URL(insert, timeout=30):
    """Fetch ``insert`` with an HTTP GET and return the response body.

    Args:
        insert: the URL to fetch.
        timeout: seconds to wait for the server before giving up, passed
            straight to ``requests.get``. Without it a stalled server would
            block the program indefinitely.

    Returns:
        The response text (``req.text``).

    Exits:
        Prints the problem to stderr and exits with status 1 when the
        request fails or the server answers with an error status, so that
        calling shells can detect the failure.
    """
    try:
        req = requests.get(insert, timeout=timeout)
        # turn 4xx/5xx answers into an exception instead of returning
        # the error page as if it were the requested text.
        req.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print(f': There was a problem: {exc}.\n: Please enter a valid URL',
              file=sys.stderr)
        sys.exit(1)
    print(': fetched URL.')
    return req.text
2020-04-23 20:53:52 +02:00
2020-04-24 18:37:16 +02:00
2020-04-24 23:11:34 +02:00
def convert_html(html):
    """Convert the HTML string ``html`` to text with html2text.

    An ``HTML2Text`` converter is configured through the attributes listed
    below, the conversion notice is printed, and every ``#`` and ``*``
    character is stripped from the converter's output before returning it.
    """
    converter = html2text.HTML2Text()
    settings = (
        ('ignore_links', True),
        ('images_to_alt', True),
        ('ignore_emphasis', True),
        ('ignore_tables', True),
        ('unicode_snob', True),
        ('decode_errors', 'ignore'),
        ('escape_all', False),  # remove all noise if needed
    )
    for option, value in settings:
        setattr(converter, option, value)
    print(': URL converted to text')
    markdown = converter.handle(html)
    # remove hashes and stars from the 'markdown'
    return re.sub('[#*]', '', markdown)
2020-04-24 23:11:34 +02:00
2020-04-24 17:32:27 +02:00
def read(infile):
    """Return the whole content of the local text file ``infile``.

    The file is decoded as UTF-8 first; if that raises
    ``UnicodeDecodeError`` it is read again as Latin-1.

    Exits:
        When ``infile`` does not exist, the module-level ``fnf`` message is
        written to stderr and the program exits with a non-zero status
        (``sys.exit`` with a string argument), so the failure is visible to
        the calling shell instead of being reported as success.
    """
    try:
        with open(infile, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(infile, encoding="latin-1") as f:
            return f.read()
    except FileNotFoundError:
        sys.exit(fnf)
2020-04-24 18:37:16 +02:00
2020-04-24 17:32:27 +02:00
def mkbtext(texttype):
    """Build a ``markovify.Text`` model from the string ``texttype``.

    ``state_size`` and ``well_formed`` are taken from the module-level
    ``args`` namespace.
    """
    model_options = {
        'state_size': args.state_size,
        'well_formed': args.well_formed,
    }
    return markovify.Text(texttype, **model_options)
2020-04-24 17:32:27 +02:00
2020-04-24 18:37:16 +02:00
2020-04-24 17:32:27 +02:00
def mkbnewline(texttype):
    """Build a ``markovify.NewlineText`` model from the string ``texttype``.

    ``state_size`` and ``well_formed`` are taken from the module-level
    ``args`` namespace.
    """
    model_options = {
        'state_size': args.state_size,
        'well_formed': args.well_formed,
    }
    return markovify.NewlineText(texttype, **model_options)
2020-04-24 18:37:16 +02:00
2020-04-24 17:32:27 +02:00
def writesentence(tmodel):
    """Generate sentences from ``tmodel`` and append them to ``args.outfile``.

    ``args.sentences`` generation attempts are made. When ``args.length`` is
    set, ``make_short_sentence`` is used with ``max_chars=args.length``;
    otherwise ``make_sentence`` is used. Each generated sentence is written
    followed by a blank line, then a ``*`` separator and another blank line.

    Args:
        tmodel: a model object exposing ``make_sentence`` and
            ``make_short_sentence`` (built by ``mkbtext`` / ``mkbnewline``
            or ``markovify.combine``).

    The output file is opened once, as UTF-8 (the encoding the input is read
    with), inside a ``with`` block so the handle is closed even if
    generation raises.
    """
    # options common to both generators; max_chars is only passed along
    # when a length limit was actually requested.
    options = {'tries': 2000, 'max_overlap_ratio': args.overlap}
    if args.length:
        generate = tmodel.make_short_sentence
        options['max_chars'] = args.length
    else:
        generate = tmodel.make_sentence

    with open(args.outfile, 'a', encoding='utf-8') as output:  # append
        for _ in range(args.sentences):
            sentence = generate(**options)
            if sentence is None:
                # the generators may give up and return None (see the
                # markovify docs); do not write the literal text "None".
                print(': could not generate a sentence within the given '
                      'constraints; skipping it.')
                continue
            output.write(sentence + '\n\n')
            output.write('*\n\n')
2020-04-24 18:37:16 +02:00
2020-04-24 17:32:27 +02:00
# make args + fnf avail to all:
# parsed command-line options, shared by every function in this module.
args = parse_the_args()
# message shown when an input file does not exist. built with implicit
# concatenation rather than a backslash continuation inside the literal, so
# the text cannot pick up whitespace from the indentation of the source.
fnf = (': error: file not found. please provide a path to a really-existing '
       'file!')
2020-04-24 17:32:27 +02:00
def _load_infile_text():
    """Return the text of ``args.infile``: fetched and converted when -u/--URL is set, else read locally."""
    if args.URL:
        return convert_html(URL(args.infile))
    return read(args.infile)


def _build_model(text):
    """Build a markov model from ``text``, using the newline-based model when --newline is set."""
    if args.newline:
        return mkbnewline(text)
    return mkbtext(text)


def main():
    """Run mkv-this: load the text(s), build the model, write the output.

    Steps:
        1. Load the infile (local file, or URL with -u).
        2. If -c or -C was given, load that second text too (a local file
           for -c, a fetched and converted URL for -C; -c wins when both are
           given), build one model per text and combine them with the
           weights ``[1, args.weight]``.
        3. Write the sentences with ``writesentence``.
        4. Print the options used and whether the output file exists.

    Exits with status 0 on success, and with status 1 when the output file
    does not exist after writing.
    """
    text = _load_infile_text()

    if args.combine or args.combine_URL:
        # second source: a local file (-c) or a URL (-C).
        if args.combine:
            ctext = read(args.combine)
        else:
            ctext = convert_html(URL(args.combine_URL))

        # build the models + a combined model:
        text_model = _build_model(text)
        ctext_model = _build_model(ctext)
        model = markovify.combine(
            [text_model, ctext_model], [1, args.weight])
    else:
        # no -c/-C: a single model from the infile.
        model = _build_model(text)

    writesentence(model)

    print('\n: The options you used are as follows:\n')
    for key, value in vars(args).items():
        print(': ' + key.ljust(15, ' ') + ': ' + str(value).ljust(10))

    if not os.path.isfile(args.outfile):
        print(': mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!',
              file=sys.stderr)
        # report the failure to the calling shell instead of exiting 0.
        sys.exit(1)

    print("\n: literary genius has been written to the file "
          + args.outfile + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'")
    sys.exit()
2020-04-19 19:57:13 +02:00
2020-04-23 20:53:52 +02:00
2020-04-24 17:32:27 +02:00
# entry point when the file is executed directly (e.g. for testing)
# rather than imported:
if __name__ == '__main__':
    main()