add combine_pdf, merged all the if branches into one
This commit is contained in:
parent
1ed0760f7e
commit
2dcbe7149c
|
@ -2,7 +2,7 @@
|
|||
|
||||
`mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so if you are a programmer and feel like picking it up and improving on it, then by all means!
|
||||
|
||||
the rest of these notes are for end users rather than programmers.
|
||||
the rest of these notes are for end users.
|
||||
|
||||
### mkv-this
|
||||
|
||||
|
@ -53,9 +53,13 @@ if for some reason you want to concatenate some files yourself, you can easily d
|
|||
* run `cat * > outputfile.txt`
|
||||
* run mkv-this on the file: `mkv-this outputfile.txt`
|
||||
|
||||
### pdfs
|
||||
|
||||
since 0.2.3, `mkv-this` can take pdfs as input. but to do this you first need to install `pdfminer.six` with `pip`. because of its size, `pdfminer.six` is not installed by default with `mkv-this`. converting pdfs like this is not fast, and `mkv-this` must convert the pdf each time it is run. so if you envisage using a large pdf many times, you would be better off converting it to plain text yourself.
|
||||
|
||||
### file types
|
||||
|
||||
you need to input plain text files. currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
|
||||
for directories of text files, the currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
|
||||
|
||||
if you have odt files rather than text files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos.
|
||||
|
||||
|
@ -80,6 +84,5 @@ i know nothing about macs so if you ask me for help i'll just send you random co
|
|||
### todo
|
||||
|
||||
* hook it up to a web-scraper.
|
||||
* hook it up to pdfs.
|
||||
* option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time.
|
||||
* learn how to programme.
|
||||
|
|
|
@ -15,7 +15,7 @@ from io import StringIO
|
|||
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
||||
|
||||
|
||||
def URL(insert):
|
||||
def url(insert):
|
||||
""" fetch a webpage, return it as html """
|
||||
try:
|
||||
req = requests.get(insert)
|
||||
|
@ -23,10 +23,10 @@ def URL(insert):
|
|||
req.encoding = req.apparent_encoding
|
||||
# use chardet to catch encoding issue with ISO-8859-1/Latin-1.
|
||||
except Exception as exc:
|
||||
print(f": There was a problem: {exc}.\n: Please enter a valid URL")
|
||||
print(f": there was trouble: {exc}.\n: please enter a valid url.")
|
||||
sys.exit()
|
||||
else:
|
||||
print(": fetched HTML from URL.")
|
||||
print(": fetched html from url.")
|
||||
return req.text
|
||||
|
||||
|
||||
|
@ -42,21 +42,27 @@ def convert_html(html):
|
|||
h2t.escape_all = False # remove all noise if needed
|
||||
s = h2t.handle(html)
|
||||
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
|
||||
print(": URL converted to plain text")
|
||||
print(": html converted to plain text")
|
||||
return s
|
||||
|
||||
|
||||
def read(infile):
|
||||
""" read a file so its ready for markov """
|
||||
try:
|
||||
with open(infile, encoding="utf-8") as f:
|
||||
return f.read()
|
||||
if infile.lower().endswith(".pdf"):
|
||||
print(
|
||||
"looks like you entered a pdf file. you need to use the '-P' flag to convert it."
|
||||
)
|
||||
sys.exit()
|
||||
else:
|
||||
with open(infile, encoding="utf-8") as f:
|
||||
return f.read()
|
||||
except UnicodeDecodeError:
|
||||
with open(infile, encoding="latin-1") as f:
|
||||
return f.read()
|
||||
except IsADirectoryError as exc:
|
||||
print(
|
||||
f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
|
||||
f": there was trouble: {exc}.\n: looks like you entered a directory. Use '-d' for that."
|
||||
)
|
||||
sys.exit()
|
||||
except FileNotFoundError:
|
||||
|
@ -138,8 +144,8 @@ def dir_cat(matchlist, bulkfile):
|
|||
|
||||
|
||||
# extract full text from a pdf:
|
||||
def convert_pdf_to_txt(path):
|
||||
print("converting pdf file...")
|
||||
def convert_pdf(path):
|
||||
print(": converting pdf file...")
|
||||
try:
|
||||
rsrcmgr = PDFResourceManager()
|
||||
retstr = StringIO()
|
||||
|
@ -170,7 +176,7 @@ def convert_pdf_to_txt(path):
|
|||
retstr.close()
|
||||
|
||||
except Exception as exc:
|
||||
print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
|
||||
print(f": there was trouble: {exc}.\n: please enter a valid pdf")
|
||||
sys.exit()
|
||||
else:
|
||||
print(": pdf converted.")
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#! /usr/bin/env python3
|
||||
|
||||
"""
|
||||
mkv-this: input a text file, directory, and/or url, output markovified text.
|
||||
mkv-this: input a text file, directory, url and/or pdf, output markovified text.
|
||||
|
||||
Copyright (C) 2020 martianhiatus@riseup.net.
|
||||
|
||||
|
@ -26,7 +26,7 @@ import sys
|
|||
import datetime
|
||||
import argparse
|
||||
from .functions import (
|
||||
URL,
|
||||
url,
|
||||
convert_html,
|
||||
dir_list,
|
||||
dir_cat,
|
||||
|
@ -55,7 +55,7 @@ def parse_the_args():
|
|||
)
|
||||
# optional args:
|
||||
parser.add_argument(
|
||||
"-u", "--URL", help="infile is a URL.", action="store_true",
|
||||
"-u", "--url", help="infile is a URL.", action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
|
@ -66,7 +66,7 @@ def parse_the_args():
|
|||
parser.add_argument(
|
||||
"-P",
|
||||
"--pdf",
|
||||
help="infile is a pdf. NB: for this to work you need to install pdfminer.",
|
||||
help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
|
@ -102,7 +102,12 @@ def parse_the_args():
|
|||
help="provide an another text file to be combined with the first item.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-C", "--combine-URL", help="provide a URL to be combined with the first item."
|
||||
"-C", "--combine-url", help="provide a URL to be combined with the first item."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-K",
|
||||
"--combine-pdf",
|
||||
help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-w",
|
||||
|
@ -146,55 +151,36 @@ args = parse_the_args()
|
|||
|
||||
|
||||
def main():
|
||||
# if a -c/-C, combine it w infile/URL:
|
||||
if args.combine or args.combine_URL:
|
||||
if args.combine:
|
||||
# get raw text as a string for both:
|
||||
# infile is URL:
|
||||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
# infile is dir:
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
# infile is pdf:
|
||||
elif args.pdf:
|
||||
text = convert_pdf(args.infile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
# read -c file:
|
||||
ctext = read(args.combine)
|
||||
# get raw text as a string for infile and -c/C if exists:
|
||||
# infile is url:
|
||||
if args.url:
|
||||
html = url(args.infile)
|
||||
text = convert_html(html)
|
||||
# infile is dir:
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
# infile is pdf:
|
||||
elif args.pdf:
|
||||
text = convert_pdf(args.infile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
||||
# if -C, combine it w infile/URL:
|
||||
elif args.combine_URL:
|
||||
# infile is URL:
|
||||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
# infile is dir:
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in args.infile:
|
||||
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
elif args.pdf:
|
||||
text = convert_pdf(args.infile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
# now combine_URL:
|
||||
html = URL(args.combine_URL)
|
||||
ctext = convert_html(html)
|
||||
if args.combine:
|
||||
ctext = read(args.combine)
|
||||
if args.combine_url:
|
||||
html = url(args.combine_url)
|
||||
ctext = convert_html(html)
|
||||
if args.combine_pdf:
|
||||
ctext = convert_pdf(args.combine_pdf)
|
||||
|
||||
# build the models + a combined model:
|
||||
# build combined model:
|
||||
if args.combine or args.combine_url or args.combine_pdf:
|
||||
# with --newline:
|
||||
if args.newline:
|
||||
text_model = mkbnewline(text, args.state_size, args.well_formed)
|
||||
|
@ -203,39 +189,18 @@ def main():
|
|||
else:
|
||||
text_model = mkbtext(text, args.state_size, args.well_formed)
|
||||
ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
|
||||
|
||||
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
|
||||
|
||||
# if no -c/-C, do normal:
|
||||
# build normal model:
|
||||
else:
|
||||
# Get raw text as string.
|
||||
# either URL:
|
||||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
elif args.pdf:
|
||||
text = convert_pdf_to_txt(args.infile)
|
||||
# or local file:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
||||
# Build the model:
|
||||
# if --newline:
|
||||
# with --newline:
|
||||
if args.newline:
|
||||
text_model = mkbnewline(text, args.state_size, args.well_formed)
|
||||
# no --newline:
|
||||
else:
|
||||
text_model = mkbtext(text, args.state_size, args.well_formed)
|
||||
|
||||
# merge the strains to prepare to write:
|
||||
if args.combine or args.combine_URL:
|
||||
# prepare to write:
|
||||
if args.combine or args.combine_url or args.combine_pdf:
|
||||
model = combo_model
|
||||
else:
|
||||
model = text_model
|
||||
|
@ -244,7 +209,7 @@ def main():
|
|||
else:
|
||||
write = writesentence
|
||||
|
||||
# optional headers in file:
|
||||
# print optional headers in file:
|
||||
with open(args.outfile, "a") as outp:
|
||||
# optional print timestamp header:
|
||||
if args.timestamp:
|
||||
|
@ -254,9 +219,11 @@ def main():
|
|||
outp.write("in: " + vars(args)["infile"] + " | ")
|
||||
if args.combine:
|
||||
outp.write("comb: " + vars(args)["combine"] + " | ")
|
||||
if args.combine_URL:
|
||||
outp.write("comb: " + vars(args)["combine_URL"] + " | ")
|
||||
if args.combine or args.combine_URL:
|
||||
if args.combine_url:
|
||||
outp.write("comb: " + vars(args)["combine_url"] + " | ")
|
||||
if args.combine_pdf:
|
||||
outp.write("comb: " + vars(args)["combine_pdf"] + " | ")
|
||||
if args.combine or args.combine_url or args.combine_pdf:
|
||||
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
|
||||
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
|
||||
outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
|
||||
|
@ -265,7 +232,7 @@ def main():
|
|||
# write it!
|
||||
write(model, args.sentences, args.outfile, args.overlap, args.length)
|
||||
|
||||
# wrapping up:
|
||||
# wrap up:
|
||||
print("\n: :\n")
|
||||
for key, value in vars(args).items():
|
||||
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
|
||||
|
|
6
setup.py
6
setup.py
|
@ -9,8 +9,8 @@ with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
|
|||
|
||||
setup(
|
||||
name="mkv-this",
|
||||
version="0.2.2",
|
||||
description="cli wrapper for markovify: take a text file or URL, markovify, save the results.",
|
||||
version="0.2.3",
|
||||
description="cli wrapper for markovify: take a text file, directory, pdf or url, markovify, save the results.",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://git.disroot.org/mousebot/mkv-this",
|
||||
|
@ -24,6 +24,6 @@ setup(
|
|||
"mkv-this-dir = mkv_this.mkv_this_dir:main",
|
||||
]
|
||||
},
|
||||
install_requires=["markovify", "argparse", "html2text", "requests",],
|
||||
install_requires=["markovify", "argparse", "html2text", "requests", "pdfminer",],
|
||||
zip_safe=False,
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue