mkv-this/mkv_this/mkv_this.py

251 lines
8.7 KiB
Python
Raw Normal View History

2020-04-19 19:57:13 +02:00
#! /usr/bin/env python3
2020-04-24 03:56:39 +02:00
2020-04-19 19:57:13 +02:00
"""
mkv-this: input a text file, directory, url and/or pdf, output markovified text.

Copyright (C) 2020 martianhiatus@riseup.net.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
2020-04-26 01:57:16 +02:00
import markovify
import os
import sys
2020-04-29 02:57:38 +02:00
import datetime
import argparse
2020-04-26 22:04:23 +02:00
from .functions import (
url,
2020-04-26 22:04:23 +02:00
convert_html,
dir_list,
dir_cat,
read,
mkbtext,
mkbnewline,
writesentence,
writeshortsentence,
2020-04-30 17:13:53 +02:00
convert_pdf,
2020-04-26 22:04:23 +02:00
)
2020-04-24 17:32:27 +02:00
# argparse
def parse_the_args(argv=None):
    """Build the mkv-this command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour existing callers rely on. Passing an explicit list
            lets the parser be used programmatically.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description="markovify local text files, directory, or URLs and output the results to a local text file.",
        epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.\n '`mkv-this` is a waste product of machine—machine interactions become the historical record.'",
    )

    # positional args:
    parser.add_argument("infile", help="the text file to process.")
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.",
    )

    # optional args describing what kind of thing infile is:
    parser.add_argument(
        "-u", "--url", help="infile is a URL.", action="store_true",
    )
    parser.add_argument(
        "-d",
        "--directory",
        help="infile is a directory. all text files in it and its subdirectories will be used.",
        action="store_true",
    )
    parser.add_argument(
        "-P",
        "--pdf",
        help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
        action="store_true",
    )

    # optional args tuning the model and its output:
    parser.add_argument(
        "-s",
        "--state-size",
        help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n",
        "--sentences",
        help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
        type=int,
        default=5,
    )
    parser.add_argument(
        "-l",
        "--length",
        help="set maximum number of characters per sentence.",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--overlap",
        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
        type=float,
        default=0.5,
    )

    # optional args naming a second source to combine with infile:
    parser.add_argument(
        "-c",
        "--combine",
        help="provide an another text file to be combined with the first item.",
    )
    parser.add_argument(
        "-C", "--combine-url", help="provide a URL to be combined with the first item."
    )
    parser.add_argument(
        "-K",
        "--combine-pdf",
        help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
    )
    parser.add_argument(
        "-w",
        "--weight",
        # the weight applies to whichever combine option is used, -K included.
        help="specify the weight to be given to the text provided with -c, -C or -K. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
        type=float,
        default=1,
    )

    # boolean flags; store_true = default to False, True if flagged.
    parser.add_argument(
        "-f",
        "--well-formed",
        help="enforce 'well_formed': discard sentences containing []{}()"
        "'' from the markov model. use if output is filthy.",
        action="store_true",
    )
    parser.add_argument(
        "--newline",
        help="sentences in input file end with newlines rather than full stops.",
        action="store_true",
    )
    parser.add_argument(
        "-t",
        "--timestamp",
        help="add date and time to the file before the output.",
        action="store_true",
    )
    parser.add_argument(
        "-p",
        "--save-options",
        help="add a brief summary of options used before the output.",
        action="store_true",
    )

    return parser.parse_args(argv)
2020-04-24 18:37:16 +02:00
# make args avail:
2020-04-24 17:32:27 +02:00
# NOTE: the command line is parsed here, at import time; main() reads this
# module-level namespace rather than parsing the arguments itself.
args = parse_the_args()
2020-04-24 17:32:27 +02:00
def main():
    """Markovify the requested input(s) and append the result to the outfile.

    Reads its options from the module-level ``args`` namespace. The steps are:
    fetch the raw text for ``infile`` (URL, directory, pdf or plain file),
    optionally fetch a second text to combine with it, build the model(s),
    write the optional headers and the generated sentences to
    ``args.outfile``, print a summary of the options, and exit via
    ``sys.exit()``.
    """
    # get raw text as a string for infile:
    if args.url:
        text = convert_html(url(args.infile))
    elif args.directory:
        matchlist = dir_list(args.infile)
        # batchfile.txt is a scratch file placed in the user-given directory.
        batchfile = os.path.join(args.infile, "batchfile.txt")
        try:
            dir_cat(matchlist, batchfile)
            text = read(batchfile)
        finally:
            # remove the scratch file even when concatenating or reading
            # failed, so it is not left behind in the user's directory.
            if os.path.exists(batchfile):
                os.unlink(batchfile)
    elif args.pdf:
        text = convert_pdf(args.infile)
    else:
        text = read(args.infile)

    # get raw text for the optional second source. if more than one combine
    # option is given, the later ones below override the earlier ones.
    combining = bool(args.combine or args.combine_url or args.combine_pdf)
    if args.combine:
        ctext = read(args.combine)
    if args.combine_url:
        ctext = convert_html(url(args.combine_url))
    if args.combine_pdf:
        ctext = convert_pdf(args.combine_pdf)

    # pick the model builder matching --newline and build the model(s):
    build_model = mkbnewline if args.newline else mkbtext
    text_model = build_model(text, args.state_size, args.well_formed)
    if combining:
        ctext_model = build_model(ctext, args.state_size, args.well_formed)
        # the initial text always has weight 1; --weight applies to the second.
        model = markovify.combine([text_model, ctext_model], [1, args.weight])
    else:
        model = text_model

    # prepare to write:
    write = writeshortsentence if args.length else writesentence

    # print optional headers in file:
    with open(args.outfile, "a") as outp:
        # optional print timestamp header:
        if args.timestamp:
            outp.write(str(datetime.datetime.now()) + ":\n")
        # optional print options used header:
        if args.save_options:
            outp.write("in: " + args.infile + " | ")
            if args.combine:
                outp.write("comb: " + args.combine + " | ")
            if args.combine_url:
                outp.write("comb: " + args.combine_url + " | ")
            if args.combine_pdf:
                outp.write("comb: " + args.combine_pdf + " | ")
            if combining:
                outp.write("weight: " + str(args.weight) + " | ")
            outp.write("overlap: " + str(args.overlap) + " | ")
            outp.write("state size: " + str(args.state_size) + "\n")
        # NOTE(review): written on every run as a separator before the new
        # output; confirm this matches the intended nesting level.
        outp.write("\n")

    # write it! done after the header handle is closed so the header reaches
    # the file first; the writer is handed the outfile path, so it presumably
    # opens the file itself.
    write(model, args.sentences, args.outfile, args.overlap, args.length)

    # wrap up: echo every option and its value.
    print("\n: :\n")
    for key, value in vars(args).items():
        print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
    if os.path.isfile(args.outfile):
        print(
            "\n: literary genius has been written to the file '"
            + args.outfile
            + "'. thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'"
        )
    else:
        print(
            ": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!"
        )

    sys.exit()