add combine_pdf, merged all the if branches into one

master
mousebot 3 years ago
parent 1ed0760f7e
commit 2dcbe7149c

@ -2,7 +2,7 @@
`mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so if you are a programmer and feel like picking it up and improving on it, then by all means!
the rest of these notes are for end users rather than programmers.
the rest of these notes are for end users.
### mkv-this
@ -53,9 +53,13 @@ if for some reason you want to concatenate some files yourself, you can easily d
* run `cat * > outputfile.txt`
* run mkv-this on the file: `mkv-this outputfile.txt`
### pdfs
since 0.2.3, `mkv-this` can take pdfs as input, but to do this you first need to install `pdfminer.six` with `pip`. because of its size, `pdfminer` is not installed by default with `mkv-this`. converting pdfs like this is not fast, and `mkv-this` must convert the pdf each time. so if you envisage using a large pdf many times, you would be better off converting it to plain text yourself.
### file types
you need to input plain text files. currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
for directories of text files, the currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
if you don't have text files, but odt files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos.
@ -80,6 +84,5 @@ i know nothing about macs so if you ask me for help i'll just send you random co
### todo
* hook it up to a web-scraper.
* hook it up to pdfs.
* option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time.
* learn how to programme.

@ -15,7 +15,7 @@ from io import StringIO
fnf = ": error: file not found. please provide a path to a really-existing file!"
def URL(insert):
def url(insert):
""" fetch a webpage, return it as html """
try:
req = requests.get(insert)
@ -23,10 +23,10 @@ def URL(insert):
req.encoding = req.apparent_encoding
# use chardet to catch encoding issue with ISO-8859-1/Latin-1.
except Exception as exc:
print(f": There was a problem: {exc}.\n: Please enter a valid URL")
print(f": there was trouble: {exc}.\n: please enter a valid url.")
sys.exit()
else:
print(": fetched HTML from URL.")
print(": fetched html from url.")
return req.text
@ -42,21 +42,27 @@ def convert_html(html):
h2t.escape_all = False # remove all noise if needed
s = h2t.handle(html)
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
print(": URL converted to plain text")
print(": html converted to plain text")
return s
def read(infile):
""" read a file so its ready for markov """
try:
with open(infile, encoding="utf-8") as f:
return f.read()
if infile.lower().endswith(".pdf"):
print(
"looks like you entered a pdf file. you need to use the '-P' flag to convert it."
)
sys.exit()
else:
with open(infile, encoding="utf-8") as f:
return f.read()
except UnicodeDecodeError:
with open(infile, encoding="latin-1") as f:
return f.read()
except IsADirectoryError as exc:
print(
f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
f": there was trouble: {exc}.\n: looks like you entered a directory. Use '-d' for that."
)
sys.exit()
except FileNotFoundError:
@ -138,8 +144,8 @@ def dir_cat(matchlist, bulkfile):
# extract full text from a pdf:
def convert_pdf_to_txt(path):
print("converting pdf file...")
def convert_pdf(path):
print(": converting pdf file...")
try:
rsrcmgr = PDFResourceManager()
retstr = StringIO()
@ -170,7 +176,7 @@ def convert_pdf_to_txt(path):
retstr.close()
except Exception as exc:
print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
print(f": there was trouble: {exc}.\n: please enter a valid pdf")
sys.exit()
else:
print(": pdf converted.")

@ -1,7 +1,7 @@
#! /usr/bin/env python3
"""
mkv-this: input a text file, directory, and/or url, output markovified text.
mkv-this: input a text file, directory, url and/or pdf, output markovified text.
Copyright (C) 2020 martianhiatus@riseup.net.
@ -26,7 +26,7 @@ import sys
import datetime
import argparse
from .functions import (
URL,
url,
convert_html,
dir_list,
dir_cat,
@ -55,7 +55,7 @@ def parse_the_args():
)
# optional args:
parser.add_argument(
"-u", "--URL", help="infile is a URL.", action="store_true",
"-u", "--url", help="infile is a URL.", action="store_true",
)
parser.add_argument(
"-d",
@ -66,7 +66,7 @@ def parse_the_args():
parser.add_argument(
"-P",
"--pdf",
help="infile is a pdf. NB: for this to work you need to install pdfminer.",
help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
action="store_true",
)
parser.add_argument(
@ -102,7 +102,12 @@ def parse_the_args():
help="provide an another text file to be combined with the first item.",
)
parser.add_argument(
"-C", "--combine-URL", help="provide a URL to be combined with the first item."
"-C", "--combine-url", help="provide a URL to be combined with the first item."
)
parser.add_argument(
"-K",
"--combine-pdf",
help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
)
parser.add_argument(
"-w",
@ -146,55 +151,36 @@ args = parse_the_args()
def main():
# if a -c/-C, combine it w infile/URL:
if args.combine or args.combine_URL:
if args.combine:
# get raw text as a string for both:
# infile is URL:
if args.URL:
html = URL(args.infile)
text = convert_html(html)
# infile is dir:
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in user-given directory:
batchfile = args.infile + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
# infile is pdf:
elif args.pdf:
text = convert_pdf(args.infile)
# or normal:
else:
text = read(args.infile)
# read -c file:
ctext = read(args.combine)
# if -C, combine it w infile/URL:
elif args.combine_URL:
# infile is URL:
if args.URL:
html = URL(args.infile)
text = convert_html(html)
# infile is dir:
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in args.infile:
batchfile = args.infile + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
elif args.pdf:
text = convert_pdf(args.infile)
# or normal:
else:
text = read(args.infile)
# now combine_URL:
html = URL(args.combine_URL)
ctext = convert_html(html)
# build the models + a combined model:
# get raw text as a string for infile and -c/C if exists:
# infile is url:
if args.url:
html = url(args.infile)
text = convert_html(html)
# infile is dir:
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in user-given directory:
batchfile = args.infile + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
# infile is pdf:
elif args.pdf:
text = convert_pdf(args.infile)
# or normal:
else:
text = read(args.infile)
if args.combine:
ctext = read(args.combine)
if args.combine_url:
html = url(args.combine_url)
ctext = convert_html(html)
if args.combine_pdf:
ctext = convert_pdf(args.combine_pdf)
# build combined model:
if args.combine or args.combine_url or args.combine_pdf:
# with --newline:
if args.newline:
text_model = mkbnewline(text, args.state_size, args.well_formed)
@ -203,39 +189,18 @@ def main():
else:
text_model = mkbtext(text, args.state_size, args.well_formed)
ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
# if no -c/-C, do normal:
# build normal model:
else:
# Get raw text as string.
# either URL:
if args.URL:
html = URL(args.infile)
text = convert_html(html)
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in user-given directory:
batchfile = args.infile + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
elif args.pdf:
text = convert_pdf_to_txt(args.infile)
# or local file:
else:
text = read(args.infile)
# Build the model:
# if --newline:
# with --newline:
if args.newline:
text_model = mkbnewline(text, args.state_size, args.well_formed)
# no --newline:
else:
text_model = mkbtext(text, args.state_size, args.well_formed)
# merge the strains to prepare to write:
if args.combine or args.combine_URL:
# prepare to write:
if args.combine or args.combine_url or args.combine_pdf:
model = combo_model
else:
model = text_model
@ -244,7 +209,7 @@ def main():
else:
write = writesentence
# optional headers in file:
# print optional headers in file:
with open(args.outfile, "a") as outp:
# optional print timestamp header:
if args.timestamp:
@ -254,9 +219,11 @@ def main():
outp.write("in: " + vars(args)["infile"] + " | ")
if args.combine:
outp.write("comb: " + vars(args)["combine"] + " | ")
if args.combine_URL:
outp.write("comb: " + vars(args)["combine_URL"] + " | ")
if args.combine or args.combine_URL:
if args.combine_url:
outp.write("comb: " + vars(args)["combine_url"] + " | ")
if args.combine_pdf:
outp.write("comb: " + vars(args)["combine_pdf"] + " | ")
if args.combine or args.combine_url or args.combine_pdf:
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
@ -265,7 +232,7 @@ def main():
# write it!
write(model, args.sentences, args.outfile, args.overlap, args.length)
# wrapping up:
# wrap up:
print("\n: :\n")
for key, value in vars(args).items():
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))

@ -9,8 +9,8 @@ with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
setup(
name="mkv-this",
version="0.2.2",
description="cli wrapper for markovify: take a text file or URL, markovify, save the results.",
version="0.2.3",
description="cli wrapper for markovify: take a text file, directory, pdf or url, markovify, save the results.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://git.disroot.org/mousebot/mkv-this",
@ -24,6 +24,6 @@ setup(
"mkv-this-dir = mkv_this.mkv_this_dir:main",
]
},
install_requires=["markovify", "argparse", "html2text", "requests",],
install_requires=["markovify", "argparse", "html2text", "requests", "pdfminer",],
zip_safe=False,
)

Loading…
Cancel
Save