#! /usr/bin/env python3

"""
    mkv-this: input a text file, directory, url and/or pdf, output markovified text.

    Copyright (C) 2020 martianhiatus@riseup.net.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import argparse
import datetime
import os
import sys

import markovify

from .functions import (
    url,
    convert_html,
    dir_list,
    dir_cat,
    read,
    mkbtext,
    mkbnewline,
    writesentence,
    writeshortsentence,
    convert_pdf,
)


# argparse
def parse_the_args():
    """Build the mkv-this command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace: the parsed options. ``infile`` and ``outfile``
        are positional (``outfile`` defaults to ``./mkv-output.txt``); the
        remaining attributes mirror the optional flags declared below.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description="markovify local text files, directory, or URLs and output the results to a local text file.",
        epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.\n '`mkv-this` is a waste product of machine—machine interactions become the historical record.'",
    )

    # positional args:
    parser.add_argument("infile", help="the text file to process.")
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.",
    )

    # optional args:
    parser.add_argument(
        "-u",
        "--url",
        help="infile is a URL.",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--directory",
        help="infile is a directory. all text files in it and its subdirectories will be used.",
        action="store_true",
    )
    parser.add_argument(
        "-P",
        "--pdf",
        help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
        action="store_true",
    )
    parser.add_argument(
        "-s",
        "--state-size",
        help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n",
        "--sentences",
        help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
        type=int,
        default=5,
    )
    parser.add_argument(
        "-l",
        "--length",
        help="set maximum number of characters per sentence.",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--overlap",
        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "-c",
        "--combine",
        help="provide an another text file to be combined with the first item.",
    )
    parser.add_argument(
        "-C", "--combine-url", help="provide a URL to be combined with the first item."
    )
    parser.add_argument(
        "-K",
        "--combine-pdf",
        help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
    )
    parser.add_argument(
        "-w",
        "--weight",
        help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
        type=float,
        default=1,
    )
    parser.add_argument(
        "-f",
        "--well-formed",
        help="enforce 'well_formed': discard sentences containing []{}()"
        "'' from the markov model. use if output is filthy.",
        action="store_true",
    )
    # store_true = default to False.
    parser.add_argument(
        "--newline",
        help="sentences in input file end with newlines rather than full stops.",
        action="store_true",
    )
    # store_true = default to False, True if flagged.
    parser.add_argument(
        "-t",
        "--timestamp",
        help="add date and time to the file before the output.",
        action="store_true",
    )
    parser.add_argument(
        "-p",
        "--save-options",
        help="add a brief summary of options used before the output.",
        action="store_true",
    )
    return parser.parse_args()


# make args avail:
# NOTE: this parses sys.argv when the module is imported; main() reads the
# resulting module-level namespace.
args = parse_the_args()


def main():
    """Run mkv-this: load the input text, build a markov model, write output.

    Reads every option from the module-level ``args`` namespace. The input
    is fetched according to ``--url`` / ``--directory`` / ``--pdf`` (plain
    file otherwise), optionally combined with a second source given by
    ``-c`` / ``-C`` / ``-K``, and the generated sentences are appended to
    ``args.outfile``. A summary of the options is printed, then the process
    exits via ``sys.exit()``.
    """
    # get raw text as a string for infile and -c/C if exists:
    # infile is url:
    if args.url:
        html = url(args.infile)
        text = convert_html(html)
    # infile is dir:
    elif args.directory:
        matchlist = dir_list(args.infile)
        # place batchfile.txt in user-given directory:
        batchfile = os.path.join(args.infile, "batchfile.txt")
        try:
            dir_cat(matchlist, batchfile)
            text = read(batchfile)
        finally:
            # never leave the temporary batch file behind in the user's
            # directory, even when concatenating or reading it fails.
            if os.path.exists(batchfile):
                os.unlink(batchfile)
    # infile is pdf:
    elif args.pdf:
        text = convert_pdf(args.infile)
    # or normal:
    else:
        text = read(args.infile)

    # secondary text to combine with, if any. when several combine options
    # are given, each is loaded in turn and the last one wins.
    if args.combine:
        ctext = read(args.combine)
    if args.combine_url:
        html = url(args.combine_url)
        ctext = convert_html(html)
    if args.combine_pdf:
        ctext = convert_pdf(args.combine_pdf)

    combining = bool(args.combine or args.combine_url or args.combine_pdf)

    # --newline selects the newline-delimited model builder:
    build_model = mkbnewline if args.newline else mkbtext

    model = build_model(text, args.state_size, args.well_formed)
    if combining:
        ctext_model = build_model(ctext, args.state_size, args.well_formed)
        # the initial text always has weight 1; -w applies to the second.
        model = markovify.combine([model, ctext_model], [1, args.weight])

    # prepare to write:
    write = writeshortsentence if args.length else writesentence

    # assemble optional headers, then append them to the file in one go:
    header = []
    # optional timestamp header:
    if args.timestamp:
        header.append(str(datetime.datetime.now()) + ":\n")
    # optional options-used header:
    if args.save_options:
        fields = ["in: " + args.infile]
        fields.extend(
            "comb: " + source
            for source in (args.combine, args.combine_url, args.combine_pdf)
            if source
        )
        if combining:
            fields.append("weight: " + str(args.weight))
        fields.append("overlap: " + str(args.overlap))
        fields.append("state size: " + str(args.state_size))
        header.append(" | ".join(fields) + "\n")
    header.append("\n")

    with open(args.outfile, "a") as outp:
        outp.write("".join(header))

    # write it!
    write(model, args.sentences, args.outfile, args.overlap, args.length)

    # wrap up:
    print("\n: :\n")
    for key, value in vars(args).items():
        print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
    if os.path.isfile(args.outfile):
        print(
            "\n: literary genius has been written to the file '"
            + args.outfile
            + "'. thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'"
        )
    else:
        print(
            ": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!"
        )

    sys.exit()