2020-04-19 19:57:13 +02:00
#! /usr/bin/env python3
2020-04-24 03:56:39 +02:00
2020-04-19 19:57:13 +02:00
"""
2020-04-30 18:46:32 +02:00
mkv-this: input a text file, directory, url and/or pdf, output markovified text.
2020-04-26 00:56:24 +02:00
Copyright (C) 2020 martianhiatus@riseup.net.
2020-04-19 19:57:13 +02:00
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
2020-04-26 01:57:16 +02:00
2020-04-26 00:56:24 +02:00
2020-04-24 03:06:50 +02:00
import markovify
2020-04-26 00:56:24 +02:00
import os
2020-04-24 03:06:50 +02:00
import sys
2020-04-29 02:57:38 +02:00
import datetime
2020-04-24 03:06:50 +02:00
import argparse
2020-04-26 22:04:23 +02:00
from . functions import (
2020-04-30 18:46:32 +02:00
url ,
2020-04-26 22:04:23 +02:00
convert_html ,
dir_list ,
dir_cat ,
read ,
mkbtext ,
mkbnewline ,
writesentence ,
writeshortsentence ,
2020-04-30 17:13:53 +02:00
convert_pdf ,
2020-04-26 22:04:23 +02:00
)
2020-04-24 03:06:50 +02:00
2020-04-24 17:32:27 +02:00
# argparse
def parse_the_args(argv=None):
    """Build the mkv-this command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process command line, which
            is what every existing zero-argument call gets.

    Returns:
        argparse.Namespace with the attributes ``infile``, ``outfile``,
        ``url``, ``directory``, ``pdf``, ``state_size``, ``sentences``,
        ``length``, ``overlap``, ``combine``, ``combine_url``,
        ``combine_pdf``, ``weight``, ``well_formed``, ``newline``,
        ``timestamp`` and ``save_options``.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description=" markovify local text files, directory, or URLs and output the results to a local text file. ",
        epilog=" may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. \n ' `mkv-this` is a waste product of machine—machine interactions become the historical record. ' ",
    )

    # positional args:
    parser.add_argument("infile", help=" the text file to process. ")
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help=" the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt. ",
    )

    # optional args describing what kind of thing infile is:
    parser.add_argument(
        "-u", "--url", help=" infile is a URL. ", action="store_true",
    )
    parser.add_argument(
        "-d",
        "--directory",
        help=" infile is a directory. all text files in it and its subdirectories will be used. ",
        action="store_true",
    )
    parser.add_argument(
        "-P",
        "--pdf",
        help=" infile is a pdf. NB: for this to work you need to install pdfminer with pip. ",
        action="store_true",
    )

    # optional args tuning the model and the amount of output:
    parser.add_argument(
        "-s",
        "--state-size",
        help=" the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect. ",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n",
        "--sentences",
        help=" the number of ' sentences ' to output. defaults to 5. NB: if your text has no initial caps, a ' sentence ' will be a paragraph. ",
        type=int,
        default=5,
    )
    # no default: stays None unless the user asks for a length limit.
    parser.add_argument(
        "-l",
        "--length",
        help=" set maximum number of characters per sentence. ",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--overlap",
        help=" the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5. ",
        type=float,
        default=0.5,
    )

    # optional second source to combine with infile (all default to None):
    parser.add_argument(
        "-c",
        "--combine",
        help=" provide an another text file to be combined with the first item. ",
    )
    parser.add_argument(
        "-C", "--combine-url", help=" provide a URL to be combined with the first item. "
    )
    parser.add_argument(
        "-K",
        "--combine-pdf",
        help=" provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip. ",
    )
    parser.add_argument(
        "-w",
        "--weight",
        help=" specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less. ",
        type=float,
        default=1,
    )

    # store_true = default to False, True if flagged.
    parser.add_argument(
        "-f",
        "--well-formed",
        help=" enforce ' well_formed ' : discard sentences containing [] {} () "
        " ' ' from the markov model. use if output is filthy. ",
        action="store_true",
    )
    parser.add_argument(
        "--newline",
        help=" sentences in input file end with newlines rather than full stops. ",
        action="store_true",
    )
    parser.add_argument(
        "-t",
        "--timestamp",
        help=" add date and time to the file before the output. ",
        action="store_true",
    )
    parser.add_argument(
        "-p",
        "--save-options",
        help=" add a brief summary of options used before the output. ",
        action="store_true",
    )

    # argv=None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
2020-04-24 18:37:16 +02:00
2020-04-26 00:56:24 +02:00
# make args avail:
2020-04-24 17:32:27 +02:00
# NOTE: this runs when the module is loaded, so the command line is parsed
# at import time; main() below reads the resulting namespace through this
# module-level name.
args = parse_the_args ( )
2020-04-23 20:53:52 +02:00
2020-04-24 17:32:27 +02:00
def main():
    """Run mkv-this end to end using the module-level ``args`` namespace.

    Steps:
      1. Get the raw input text from a URL, a directory, a pdf or a plain
         file, depending on the flags set in ``args``.
      2. If any of ``--combine`` / ``--combine-url`` / ``--combine-pdf`` was
         given, get that second text as well and combine both models with
         ``markovify.combine`` using weights ``[1, args.weight]``.
      3. Append the optional timestamp / options headers to ``args.outfile``.
      4. Call the sentence writer, print a summary of the options used, and
         exit via ``sys.exit()``.

    NOTE(review): the nesting of the trailing header newline, of the writer
    call and of ``sys.exit()`` was reconstructed from a listing that had
    lost its indentation -- confirm against the repository copy.
    """
    has_combo = bool(args.combine or args.combine_url or args.combine_pdf)

    # get raw text as a string for infile:
    if args.url:
        text = convert_html(url(args.infile))
    elif args.directory:
        matchlist = dir_list(args.infile)
        # place batchfile.txt in user-given directory:
        batchfile = os.path.join(args.infile, "batchfile.txt")
        try:
            dir_cat(matchlist, batchfile)
            text = read(batchfile)
        finally:
            # always clean up the scratch file, even when reading failed,
            # so nothing is left behind in the user's directory.
            if os.path.exists(batchfile):
                os.unlink(batchfile)
    elif args.pdf:
        text = convert_pdf(args.infile)
    else:
        text = read(args.infile)

    # get raw text for -c / -C / -K if given. the checks are sequential on
    # purpose: if several are supplied, the last one listed here wins.
    if args.combine:
        ctext = read(args.combine)
    if args.combine_url:
        ctext = convert_html(url(args.combine_url))
    if args.combine_pdf:
        ctext = convert_pdf(args.combine_pdf)

    # --newline selects the newline-based model builder for every text.
    build_model = mkbnewline if args.newline else mkbtext
    model = build_model(text, args.state_size, args.well_formed)
    if has_combo:
        ctext_model = build_model(ctext, args.state_size, args.well_formed)
        # the initial text always has weight 1; args.weight scales the second.
        model = markovify.combine([model, ctext_model], [1, args.weight])

    # a length limit (-l) selects the short-sentence writer.
    write = writeshortsentence if args.length else writesentence

    # print optional headers in file:
    with open(args.outfile, "a") as outp:
        # optional print timestamp header:
        if args.timestamp:
            outp.write(str(datetime.datetime.now()) + " : \n ")
        # optional print options used header:
        if args.save_options:
            outp.write(" in: " + args.infile + " | ")
            if args.combine:
                outp.write(" comb: " + args.combine + " | ")
            if args.combine_url:
                outp.write(" comb: " + args.combine_url + " | ")
            if args.combine_pdf:
                outp.write(" comb: " + args.combine_pdf + " | ")
            if has_combo:
                outp.write(" weight: " + str(args.weight) + " | ")
            outp.write(" overlap: " + str(args.overlap) + " | ")
            outp.write(" state size: " + str(args.state_size) + " \n ")
        outp.write(" \n ")

    # write it! the writer is handed the output path rather than the open
    # handle, so the header file is closed (and flushed) first to keep the
    # headers ahead of the generated sentences.
    write(model, args.sentences, args.outfile, args.overlap, args.length)

    # wrap up:
    print(" \n : : \n ")
    for key, value in vars(args).items():
        print(" : " + key.ljust(15, " ") + " : " + str(value).ljust(10))
    if os.path.isfile(args.outfile):
        print(
            " \n : literary genius has been written to the file ' "
            + args.outfile
            + " ' . thanks for playing! \n \n : ' Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected... ' "
        )
    else:
        print(
            " : mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz! "
        )
    sys.exit()