mkv-this/mkv_this/mkv_this.py

251 lines
8.7 KiB
Python
Executable File

#! /usr/bin/env python3
"""
mkv-this: input a text file, directory, url and/or pdf, output markovified text.
Copyright (C) 2020 martianhiatus@riseup.net.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import markovify
import os
import sys
import datetime
import argparse
from .functions import (
url,
convert_html,
dir_list,
dir_cat,
read,
mkbtext,
mkbnewline,
writesentence,
writeshortsentence,
convert_pdf,
)
# argparse
def parse_the_args(argv=None):
    """Build the mkv-this command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings to parse (without the
            program name). When left as ``None`` argparse falls back to
            ``sys.argv[1:]``, which is what a no-argument call has always done.

    Returns:
        The ``argparse.Namespace`` holding ``infile``, ``outfile`` and every
        option defined below (dashes in option names become underscores,
        e.g. ``--state-size`` -> ``state_size``).
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description="markovify local text files, directory, or URLs and output the results to a local text file.",
        epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.\n '`mkv-this` is a waste product of machine—machine interactions become the historical record.'",
    )

    # positional args:
    parser.add_argument("infile", help="the text file to process.")
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.",
    )

    # optional args -- what kind of thing infile is:
    parser.add_argument(
        "-u", "--url", help="infile is a URL.", action="store_true",
    )
    parser.add_argument(
        "-d",
        "--directory",
        help="infile is a directory. all text files in it and its subdirectories will be used.",
        action="store_true",
    )
    parser.add_argument(
        "-P",
        "--pdf",
        help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
        action="store_true",
    )

    # optional args -- model and output tuning:
    parser.add_argument(
        "-s",
        "--state-size",
        help="the number of preceding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n",
        "--sentences",
        help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
        type=int,
        default=5,
    )
    parser.add_argument(
        "-l",
        "--length",
        help="set maximum number of characters per sentence.",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--overlap",
        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
        type=float,
        default=0.5,
    )

    # optional args -- a second source to combine with infile:
    parser.add_argument(
        "-c",
        "--combine",
        help="provide another text file to be combined with the first item.",
    )
    parser.add_argument(
        "-C", "--combine-url", help="provide a URL to be combined with the first item."
    )
    parser.add_argument(
        "-K",
        "--combine-pdf",
        help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
    )
    parser.add_argument(
        "-w",
        "--weight",
        help="specify the weight to be given to the text provided with -c, -C or -K. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
        type=float,
        default=1,
    )

    # store_true = default to False, True if flagged.
    parser.add_argument(
        "-f",
        "--well-formed",
        help="enforce 'well_formed': discard sentences containing []{}()"
        "'' from the markov model. use if output is filthy.",
        action="store_true",
    )
    parser.add_argument(
        "--newline",
        help="sentences in input file end with newlines rather than full stops.",
        action="store_true",
    )
    parser.add_argument(
        "-t",
        "--timestamp",
        help="add date and time to the file before the output.",
        action="store_true",
    )
    parser.add_argument(
        "-p",
        "--save-options",
        help="add a brief summary of options used before the output.",
        action="store_true",
    )

    # argv=None makes argparse read sys.argv[1:], preserving the old behaviour.
    return parser.parse_args(argv)
# parse the command line once, when this module is imported, and keep the
# resulting namespace at module level: main() reads all of its options
# from this global `args`.
args = parse_the_args()
def _read_directory(dirpath):
    """Return the text gathered from *dirpath* via a scratch batch file.

    ``dir_list`` and ``dir_cat`` come from ``.functions``; presumably they
    collect the matching files and concatenate them into the batch file --
    confirm against that module. The batch file is created inside *dirpath*
    and is removed again even if concatenating or reading fails.
    """
    matchlist = dir_list(dirpath)
    # place batchfile.txt in user-given directory:
    batchfile = os.path.join(dirpath, "batchfile.txt")
    try:
        dir_cat(matchlist, batchfile)
        return read(batchfile)
    finally:
        # only unlink what exists, so a failure before the file was created
        # is not masked by a FileNotFoundError from the cleanup itself.
        if os.path.exists(batchfile):
            os.unlink(batchfile)


def _write_headers(outp, options, combining):
    """Write the optional timestamp / options headers to the open file *outp*.

    *options* is the parsed argument namespace; *combining* says whether a
    second text (-c/-C/-K) is in play, in which case its weight is recorded.
    A blank separator line is always written last.
    """
    # optional print timestamp header:
    if options.timestamp:
        outp.write(str(datetime.datetime.now()) + ":\n")
    # optional print options used header:
    if options.save_options:
        outp.write("in: " + options.infile + " | ")
        for extra in (options.combine, options.combine_url, options.combine_pdf):
            if extra:
                outp.write("comb: " + extra + " | ")
        if combining:
            outp.write("weight: " + str(options.weight) + " | ")
        outp.write("overlap: " + str(options.overlap) + " | ")
        outp.write("state size: " + str(options.state_size) + "\n")
    outp.write("\n")


def main():
    """Entry point: read the input(s), build the markov model, write output.

    All options are taken from the module-level ``args`` namespace. The
    generated sentences are appended to ``args.outfile``, a summary of the
    options is printed to stdout, and the process then exits via
    ``sys.exit()``.
    """
    # get raw text as a string for infile:
    if args.url:
        text = convert_html(url(args.infile))
    elif args.directory:
        text = _read_directory(args.infile)
    elif args.pdf:
        text = convert_pdf(args.infile)
    else:
        text = read(args.infile)

    # ...and for the text to combine, if any. every option given is loaded;
    # when several are given the last one below is the one that is used.
    if args.combine:
        ctext = read(args.combine)
    if args.combine_url:
        ctext = convert_html(url(args.combine_url))
    if args.combine_pdf:
        ctext = convert_pdf(args.combine_pdf)
    combining = bool(args.combine or args.combine_url or args.combine_pdf)

    # --newline selects the newline-delimited model builder:
    build_model = mkbnewline if args.newline else mkbtext
    model = build_model(text, args.state_size, args.well_formed)
    if combining:
        ctext_model = build_model(ctext, args.state_size, args.well_formed)
        # the initial text always has weight 1; -w sets the second text's.
        model = markovify.combine([model, ctext_model], [1, args.weight])

    # prepare to write:
    write = writeshortsentence if args.length else writesentence

    # print optional headers in file (closed before `write` reopens it, so
    # the headers land ahead of the generated text):
    with open(args.outfile, "a") as outp:
        _write_headers(outp, args, combining)

    # write it!
    write(model, args.sentences, args.outfile, args.overlap, args.length)

    # wrap up:
    print("\n: :\n")
    for key, value in vars(args).items():
        print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
    if os.path.isfile(args.outfile):
        print(
            "\n: literary genius has been written to the file '"
            + args.outfile
            + "'. thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'"
        )
    else:
        print(
            ": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!"
        )
    sys.exit()