add combine_pdf, merged all the if branches into one

This commit is contained in:
mousebot 2020-04-30 13:46:32 -03:00
parent 1ed0760f7e
commit 2dcbe7149c
4 changed files with 74 additions and 98 deletions

View File

@ -2,7 +2,7 @@
`mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so if you are a programmer and feel like picking it up and improving on it, then by all means! `mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so if you are a programmer and feel like picking it up and improving on it, then by all means!
the rest of these notes are for end users rather than programmers. the rest of these notes are for end users.
### mkv-this ### mkv-this
@ -53,9 +53,13 @@ if for some reason you want to concatenate some files yourself, you can easily d
* run `cat * > outputfile.txt` * run `cat * > outputfile.txt`
* run mkv-this on the file: `mkv-this outputfile.txt` * run mkv-this on the file: `mkv-this outputfile.txt`
### pdfs
since 0.2.3, `mkv-this` can take pdfs as input. but to do this you first need to install `pdfminer.six` with `pip` (i.e. `pip install pdfminer.six`). because of its size, `pdfminer.six` is not installed by default with `mkv-this`. converting pdfs like this is not fast, and `mkv-this` must convert the pdf each time. so if you envisage using a large pdf many times, you would be better off converting it to plain text yourself.
### file types ### file types
you need to input plain text files. currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask. for directories of text files, the currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
if you don't have text files, but odt files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos. if you don't have text files, but odt files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos.
@ -80,6 +84,5 @@ i know nothing about macs so if you ask me for help i'll just send you random co
### todo ### todo
* hook it up to a web-scraper. * hook it up to a web-scraper.
* hook it up to pdfs.
* option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time. * option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time.
* learn how to programme. * learn how to programme.

View File

@ -15,7 +15,7 @@ from io import StringIO
fnf = ": error: file not found. please provide a path to a really-existing file!" fnf = ": error: file not found. please provide a path to a really-existing file!"
def URL(insert): def url(insert):
""" fetch a webpage, return it as html """ """ fetch a webpage, return it as html """
try: try:
req = requests.get(insert) req = requests.get(insert)
@ -23,10 +23,10 @@ def URL(insert):
req.encoding = req.apparent_encoding req.encoding = req.apparent_encoding
# use chardet to catch encoding issue with ISO-8859-1/Latin-1. # use chardet to catch encoding issue with ISO-8859-1/Latin-1.
except Exception as exc: except Exception as exc:
print(f": There was a problem: {exc}.\n: Please enter a valid URL") print(f": there was trouble: {exc}.\n: please enter a valid url.")
sys.exit() sys.exit()
else: else:
print(": fetched HTML from URL.") print(": fetched html from url.")
return req.text return req.text
@ -42,21 +42,27 @@ def convert_html(html):
h2t.escape_all = False # remove all noise if needed h2t.escape_all = False # remove all noise if needed
s = h2t.handle(html) s = h2t.handle(html)
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown' s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
print(": URL converted to plain text") print(": html converted to plain text")
return s return s
def read(infile): def read(infile):
""" read a file so its ready for markov """ """ read a file so its ready for markov """
try: try:
with open(infile, encoding="utf-8") as f: if infile.lower().endswith(".pdf"):
return f.read() print(
"looks like you entered a pdf file. you need to use the '-P' flag to convert it."
)
sys.exit()
else:
with open(infile, encoding="utf-8") as f:
return f.read()
except UnicodeDecodeError: except UnicodeDecodeError:
with open(infile, encoding="latin-1") as f: with open(infile, encoding="latin-1") as f:
return f.read() return f.read()
except IsADirectoryError as exc: except IsADirectoryError as exc:
print( print(
f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that." f": there was trouble: {exc}.\n: looks like you entered a directory. Use '-d' for that."
) )
sys.exit() sys.exit()
except FileNotFoundError: except FileNotFoundError:
@ -138,8 +144,8 @@ def dir_cat(matchlist, bulkfile):
# extract full text from a pdf: # extract full text from a pdf:
def convert_pdf_to_txt(path): def convert_pdf(path):
print("converting pdf file...") print(": converting pdf file...")
try: try:
rsrcmgr = PDFResourceManager() rsrcmgr = PDFResourceManager()
retstr = StringIO() retstr = StringIO()
@ -170,7 +176,7 @@ def convert_pdf_to_txt(path):
retstr.close() retstr.close()
except Exception as exc: except Exception as exc:
print(f": There was a problem: {exc}.\n: Please enter a valid pdf") print(f": there was trouble: {exc}.\n: please enter a valid pdf")
sys.exit() sys.exit()
else: else:
print(": pdf converted.") print(": pdf converted.")

View File

@ -1,7 +1,7 @@
#! /usr/bin/env python3 #! /usr/bin/env python3
""" """
mkv-this: input a text file, directory, and/or url, output markovified text. mkv-this: input a text file, directory, url and/or pdf, output markovified text.
Copyright (C) 2020 martianhiatus@riseup.net. Copyright (C) 2020 martianhiatus@riseup.net.
@ -26,7 +26,7 @@ import sys
import datetime import datetime
import argparse import argparse
from .functions import ( from .functions import (
URL, url,
convert_html, convert_html,
dir_list, dir_list,
dir_cat, dir_cat,
@ -55,7 +55,7 @@ def parse_the_args():
) )
# optional args: # optional args:
parser.add_argument( parser.add_argument(
"-u", "--URL", help="infile is a URL.", action="store_true", "-u", "--url", help="infile is a URL.", action="store_true",
) )
parser.add_argument( parser.add_argument(
"-d", "-d",
@ -66,7 +66,7 @@ def parse_the_args():
parser.add_argument( parser.add_argument(
"-P", "-P",
"--pdf", "--pdf",
help="infile is a pdf. NB: for this to work you need to install pdfminer.", help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
action="store_true", action="store_true",
) )
parser.add_argument( parser.add_argument(
@ -102,7 +102,12 @@ def parse_the_args():
help="provide an another text file to be combined with the first item.", help="provide an another text file to be combined with the first item.",
) )
parser.add_argument( parser.add_argument(
"-C", "--combine-URL", help="provide a URL to be combined with the first item." "-C", "--combine-url", help="provide a URL to be combined with the first item."
)
parser.add_argument(
"-K",
"--combine-pdf",
help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
) )
parser.add_argument( parser.add_argument(
"-w", "-w",
@ -146,55 +151,36 @@ args = parse_the_args()
def main(): def main():
# if a -c/-C, combine it w infile/URL: # get raw text as a string for infile and -c/C if exists:
if args.combine or args.combine_URL: # infile is url:
if args.combine: if args.url:
# get raw text as a string for both: html = url(args.infile)
# infile is URL: text = convert_html(html)
if args.URL: # infile is dir:
html = URL(args.infile) elif args.directory:
text = convert_html(html) matchlist = dir_list(args.infile)
# infile is dir: # place batchfile.txt in user-given directory:
elif args.directory: batchfile = args.infile + os.path.sep + "batchfile.txt"
matchlist = dir_list(args.infile) dir_cat(matchlist, batchfile)
# place batchfile.txt in user-given directory: text = read(batchfile)
batchfile = args.infile + os.path.sep + "batchfile.txt" os.unlink(batchfile)
dir_cat(matchlist, batchfile) # infile is pdf:
text = read(batchfile) elif args.pdf:
os.unlink(batchfile) text = convert_pdf(args.infile)
# infile is pdf: # or normal:
elif args.pdf: else:
text = convert_pdf(args.infile) text = read(args.infile)
# or normal:
else:
text = read(args.infile)
# read -c file:
ctext = read(args.combine)
# if -C, combine it w infile/URL: if args.combine:
elif args.combine_URL: ctext = read(args.combine)
# infile is URL: if args.combine_url:
if args.URL: html = url(args.combine_url)
html = URL(args.infile) ctext = convert_html(html)
text = convert_html(html) if args.combine_pdf:
# infile is dir: ctext = convert_pdf(args.combine_pdf)
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in args.infile:
batchfile = args.infile + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
elif args.pdf:
text = convert_pdf(args.infile)
# or normal:
else:
text = read(args.infile)
# now combine_URL:
html = URL(args.combine_URL)
ctext = convert_html(html)
# build the models + a combined model: # build combined model:
if args.combine or args.combine_url or args.combine_pdf:
# with --newline: # with --newline:
if args.newline: if args.newline:
text_model = mkbnewline(text, args.state_size, args.well_formed) text_model = mkbnewline(text, args.state_size, args.well_formed)
@ -203,39 +189,18 @@ def main():
else: else:
text_model = mkbtext(text, args.state_size, args.well_formed) text_model = mkbtext(text, args.state_size, args.well_formed)
ctext_model = mkbtext(ctext, args.state_size, args.well_formed) ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight]) combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
# build normal model:
# if no -c/-C, do normal:
else: else:
# Get raw text as string. # with --newline:
# either URL:
if args.URL:
html = URL(args.infile)
text = convert_html(html)
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in user-given directory:
batchfile = args.infile + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
elif args.pdf:
text = convert_pdf_to_txt(args.infile)
# or local file:
else:
text = read(args.infile)
# Build the model:
# if --newline:
if args.newline: if args.newline:
text_model = mkbnewline(text, args.state_size, args.well_formed) text_model = mkbnewline(text, args.state_size, args.well_formed)
# no --newline: # no --newline:
else: else:
text_model = mkbtext(text, args.state_size, args.well_formed) text_model = mkbtext(text, args.state_size, args.well_formed)
# merge the strains to prepare to write: # prepare to write:
if args.combine or args.combine_URL: if args.combine or args.combine_url or args.combine_pdf:
model = combo_model model = combo_model
else: else:
model = text_model model = text_model
@ -244,7 +209,7 @@ def main():
else: else:
write = writesentence write = writesentence
# optional headers in file: # print optional headers in file:
with open(args.outfile, "a") as outp: with open(args.outfile, "a") as outp:
# optional print timestamp header: # optional print timestamp header:
if args.timestamp: if args.timestamp:
@ -254,9 +219,11 @@ def main():
outp.write("in: " + vars(args)["infile"] + " | ") outp.write("in: " + vars(args)["infile"] + " | ")
if args.combine: if args.combine:
outp.write("comb: " + vars(args)["combine"] + " | ") outp.write("comb: " + vars(args)["combine"] + " | ")
if args.combine_URL: if args.combine_url:
outp.write("comb: " + vars(args)["combine_URL"] + " | ") outp.write("comb: " + vars(args)["combine_url"] + " | ")
if args.combine or args.combine_URL: if args.combine_pdf:
outp.write("comb: " + vars(args)["combine_pdf"] + " | ")
if args.combine or args.combine_url or args.combine_pdf:
outp.write("weight: " + str(vars(args)["weight"]) + " | ") outp.write("weight: " + str(vars(args)["weight"]) + " | ")
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ") outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
outp.write("state size: " + str(vars(args)["state_size"]) + "\n") outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
@ -265,7 +232,7 @@ def main():
# write it! # write it!
write(model, args.sentences, args.outfile, args.overlap, args.length) write(model, args.sentences, args.outfile, args.overlap, args.length)
# wrapping up: # wrap up:
print("\n: :\n") print("\n: :\n")
for key, value in vars(args).items(): for key, value in vars(args).items():
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10)) print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))

View File

@ -9,8 +9,8 @@ with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
setup( setup(
name="mkv-this", name="mkv-this",
version="0.2.2", version="0.2.3",
description="cli wrapper for markovify: take a text file or URL, markovify, save the results.", description="cli wrapper for markovify: take a text file, directory, pdf or url, markovify, save the results.",
long_description=long_description, long_description=long_description,
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
url="https://git.disroot.org/mousebot/mkv-this", url="https://git.disroot.org/mousebot/mkv-this",
@ -24,6 +24,6 @@ setup(
"mkv-this-dir = mkv_this.mkv_this_dir:main", "mkv-this-dir = mkv_this.mkv_this_dir:main",
] ]
}, },
install_requires=["markovify", "argparse", "html2text", "requests",], install_requires=["markovify", "argparse", "html2text", "requests", "pdfminer",],
zip_safe=False, zip_safe=False,
) )