add combine_pdf, merged all the if branches into one
This commit is contained in:
parent
1ed0760f7e
commit
2dcbe7149c
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
`mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so if you are a programmer and feel like picking it up and improving on it, then by all means!
|
`mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so if you are a programmer and feel like picking it up and improving on it, then by all means!
|
||||||
|
|
||||||
the rest of these notes are for end users rather than programmers.
|
the rest of these notes are for end users.
|
||||||
|
|
||||||
### mkv-this
|
### mkv-this
|
||||||
|
|
||||||
|
@ -53,9 +53,13 @@ if for some reason you want to concatenate some files yourself, you can easily d
|
||||||
* run `cat * > outputfile.txt`
|
* run `cat * > outputfile.txt`
|
||||||
* run mkv-this on the file: `mkv-this outputfile.txt`
|
* run mkv-this on the file: `mkv-this outputfile.txt`
|
||||||
|
|
||||||
|
### pdfs
|
||||||
|
|
||||||
|
since 0.2.3, `mkv-this` can take pdfs as input. but to do this you first need to install `pdfminer.six` with `pip`. because of its size, `pdfminer.six` is not installed by default with `mkv-this`. converting pdfs like this is not fast, and `mkv-this` must convert the pdf each time. so if you envisage using a large pdf many times, you would be better off converting it to plain text yourself.
|
||||||
|
|
||||||
### file types
|
### file types
|
||||||
|
|
||||||
you need to input plain text files. currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
|
for directories of text files, the currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
|
||||||
|
|
||||||
if you don't have text files, but odt files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos.
|
if you don't have text files, but odt files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos.
|
||||||
|
|
||||||
|
@ -80,6 +84,5 @@ i know nothing about macs so if you ask me for help i'll just send you random co
|
||||||
### todo
|
### todo
|
||||||
|
|
||||||
* hook it up to a web-scraper.
|
* hook it up to a web-scraper.
|
||||||
* hook it up to pdfs.
|
|
||||||
* option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time.
|
* option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time.
|
||||||
* learn how to programme.
|
* learn how to programme.
|
||||||
|
|
|
@ -15,7 +15,7 @@ from io import StringIO
|
||||||
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
||||||
|
|
||||||
|
|
||||||
def URL(insert):
|
def url(insert):
|
||||||
""" fetch a webpage, return it as html """
|
""" fetch a webpage, return it as html """
|
||||||
try:
|
try:
|
||||||
req = requests.get(insert)
|
req = requests.get(insert)
|
||||||
|
@ -23,10 +23,10 @@ def URL(insert):
|
||||||
req.encoding = req.apparent_encoding
|
req.encoding = req.apparent_encoding
|
||||||
# use chardet to catch encoding issue with ISO-8859-1/Latin-1.
|
# use chardet to catch encoding issue with ISO-8859-1/Latin-1.
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f": There was a problem: {exc}.\n: Please enter a valid URL")
|
print(f": there was trouble: {exc}.\n: please enter a valid url.")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
else:
|
else:
|
||||||
print(": fetched HTML from URL.")
|
print(": fetched html from url.")
|
||||||
return req.text
|
return req.text
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,21 +42,27 @@ def convert_html(html):
|
||||||
h2t.escape_all = False # remove all noise if needed
|
h2t.escape_all = False # remove all noise if needed
|
||||||
s = h2t.handle(html)
|
s = h2t.handle(html)
|
||||||
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
|
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
|
||||||
print(": URL converted to plain text")
|
print(": html converted to plain text")
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
def read(infile):
|
def read(infile):
|
||||||
""" read a file so its ready for markov """
|
""" read a file so its ready for markov """
|
||||||
try:
|
try:
|
||||||
with open(infile, encoding="utf-8") as f:
|
if infile.lower().endswith(".pdf"):
|
||||||
return f.read()
|
print(
|
||||||
|
"looks like you entered a pdf file. you need to use the '-P' flag to convert it."
|
||||||
|
)
|
||||||
|
sys.exit()
|
||||||
|
else:
|
||||||
|
with open(infile, encoding="utf-8") as f:
|
||||||
|
return f.read()
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
with open(infile, encoding="latin-1") as f:
|
with open(infile, encoding="latin-1") as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
except IsADirectoryError as exc:
|
except IsADirectoryError as exc:
|
||||||
print(
|
print(
|
||||||
f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
|
f": there was trouble: {exc}.\n: looks like you entered a directory. Use '-d' for that."
|
||||||
)
|
)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
|
@ -138,8 +144,8 @@ def dir_cat(matchlist, bulkfile):
|
||||||
|
|
||||||
|
|
||||||
# extract full text from a pdf:
|
# extract full text from a pdf:
|
||||||
def convert_pdf_to_txt(path):
|
def convert_pdf(path):
|
||||||
print("converting pdf file...")
|
print(": converting pdf file...")
|
||||||
try:
|
try:
|
||||||
rsrcmgr = PDFResourceManager()
|
rsrcmgr = PDFResourceManager()
|
||||||
retstr = StringIO()
|
retstr = StringIO()
|
||||||
|
@ -170,7 +176,7 @@ def convert_pdf_to_txt(path):
|
||||||
retstr.close()
|
retstr.close()
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
|
print(f": there was trouble: {exc}.\n: please enter a valid pdf")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
else:
|
else:
|
||||||
print(": pdf converted.")
|
print(": pdf converted.")
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
#! /usr/bin/env python3
|
#! /usr/bin/env python3
|
||||||
|
|
||||||
"""
|
"""
|
||||||
mkv-this: input a text file, directory, and/or url, output markovified text.
|
mkv-this: input a text file, directory, url and/or pdf, output markovified text.
|
||||||
|
|
||||||
Copyright (C) 2020 martianhiatus@riseup.net.
|
Copyright (C) 2020 martianhiatus@riseup.net.
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ import sys
|
||||||
import datetime
|
import datetime
|
||||||
import argparse
|
import argparse
|
||||||
from .functions import (
|
from .functions import (
|
||||||
URL,
|
url,
|
||||||
convert_html,
|
convert_html,
|
||||||
dir_list,
|
dir_list,
|
||||||
dir_cat,
|
dir_cat,
|
||||||
|
@ -55,7 +55,7 @@ def parse_the_args():
|
||||||
)
|
)
|
||||||
# optional args:
|
# optional args:
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-u", "--URL", help="infile is a URL.", action="store_true",
|
"-u", "--url", help="infile is a URL.", action="store_true",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d",
|
"-d",
|
||||||
|
@ -66,7 +66,7 @@ def parse_the_args():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-P",
|
"-P",
|
||||||
"--pdf",
|
"--pdf",
|
||||||
help="infile is a pdf. NB: for this to work you need to install pdfminer.",
|
help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
@ -102,7 +102,12 @@ def parse_the_args():
|
||||||
help="provide an another text file to be combined with the first item.",
|
help="provide an another text file to be combined with the first item.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-C", "--combine-URL", help="provide a URL to be combined with the first item."
|
"-C", "--combine-url", help="provide a URL to be combined with the first item."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-K",
|
||||||
|
"--combine-pdf",
|
||||||
|
help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-w",
|
"-w",
|
||||||
|
@ -146,55 +151,36 @@ args = parse_the_args()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# if a -c/-C, combine it w infile/URL:
|
# get raw text as a string for infile and -c/C if exists:
|
||||||
if args.combine or args.combine_URL:
|
# infile is url:
|
||||||
if args.combine:
|
if args.url:
|
||||||
# get raw text as a string for both:
|
html = url(args.infile)
|
||||||
# infile is URL:
|
text = convert_html(html)
|
||||||
if args.URL:
|
# infile is dir:
|
||||||
html = URL(args.infile)
|
elif args.directory:
|
||||||
text = convert_html(html)
|
matchlist = dir_list(args.infile)
|
||||||
# infile is dir:
|
# place batchfile.txt in user-given directory:
|
||||||
elif args.directory:
|
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
||||||
matchlist = dir_list(args.infile)
|
dir_cat(matchlist, batchfile)
|
||||||
# place batchfile.txt in user-given directory:
|
text = read(batchfile)
|
||||||
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
os.unlink(batchfile)
|
||||||
dir_cat(matchlist, batchfile)
|
# infile is pdf:
|
||||||
text = read(batchfile)
|
elif args.pdf:
|
||||||
os.unlink(batchfile)
|
text = convert_pdf(args.infile)
|
||||||
# infile is pdf:
|
# or normal:
|
||||||
elif args.pdf:
|
else:
|
||||||
text = convert_pdf(args.infile)
|
text = read(args.infile)
|
||||||
# or normal:
|
|
||||||
else:
|
|
||||||
text = read(args.infile)
|
|
||||||
# read -c file:
|
|
||||||
ctext = read(args.combine)
|
|
||||||
|
|
||||||
# if -C, combine it w infile/URL:
|
if args.combine:
|
||||||
elif args.combine_URL:
|
ctext = read(args.combine)
|
||||||
# infile is URL:
|
if args.combine_url:
|
||||||
if args.URL:
|
html = url(args.combine_url)
|
||||||
html = URL(args.infile)
|
ctext = convert_html(html)
|
||||||
text = convert_html(html)
|
if args.combine_pdf:
|
||||||
# infile is dir:
|
ctext = convert_pdf(args.combine_pdf)
|
||||||
elif args.directory:
|
|
||||||
matchlist = dir_list(args.infile)
|
|
||||||
# place batchfile.txt in args.infile:
|
|
||||||
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
|
||||||
dir_cat(matchlist, batchfile)
|
|
||||||
text = read(batchfile)
|
|
||||||
os.unlink(batchfile)
|
|
||||||
elif args.pdf:
|
|
||||||
text = convert_pdf(args.infile)
|
|
||||||
# or normal:
|
|
||||||
else:
|
|
||||||
text = read(args.infile)
|
|
||||||
# now combine_URL:
|
|
||||||
html = URL(args.combine_URL)
|
|
||||||
ctext = convert_html(html)
|
|
||||||
|
|
||||||
# build the models + a combined model:
|
# build combined model:
|
||||||
|
if args.combine or args.combine_url or args.combine_pdf:
|
||||||
# with --newline:
|
# with --newline:
|
||||||
if args.newline:
|
if args.newline:
|
||||||
text_model = mkbnewline(text, args.state_size, args.well_formed)
|
text_model = mkbnewline(text, args.state_size, args.well_formed)
|
||||||
|
@ -203,39 +189,18 @@ def main():
|
||||||
else:
|
else:
|
||||||
text_model = mkbtext(text, args.state_size, args.well_formed)
|
text_model = mkbtext(text, args.state_size, args.well_formed)
|
||||||
ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
|
ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
|
||||||
|
|
||||||
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
|
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
|
||||||
|
# build normal model:
|
||||||
# if no -c/-C, do normal:
|
|
||||||
else:
|
else:
|
||||||
# Get raw text as string.
|
# with --newline:
|
||||||
# either URL:
|
|
||||||
if args.URL:
|
|
||||||
html = URL(args.infile)
|
|
||||||
text = convert_html(html)
|
|
||||||
elif args.directory:
|
|
||||||
matchlist = dir_list(args.infile)
|
|
||||||
# place batchfile.txt in user-given directory:
|
|
||||||
batchfile = args.infile + os.path.sep + "batchfile.txt"
|
|
||||||
dir_cat(matchlist, batchfile)
|
|
||||||
text = read(batchfile)
|
|
||||||
os.unlink(batchfile)
|
|
||||||
elif args.pdf:
|
|
||||||
text = convert_pdf_to_txt(args.infile)
|
|
||||||
# or local file:
|
|
||||||
else:
|
|
||||||
text = read(args.infile)
|
|
||||||
|
|
||||||
# Build the model:
|
|
||||||
# if --newline:
|
|
||||||
if args.newline:
|
if args.newline:
|
||||||
text_model = mkbnewline(text, args.state_size, args.well_formed)
|
text_model = mkbnewline(text, args.state_size, args.well_formed)
|
||||||
# no --newline:
|
# no --newline:
|
||||||
else:
|
else:
|
||||||
text_model = mkbtext(text, args.state_size, args.well_formed)
|
text_model = mkbtext(text, args.state_size, args.well_formed)
|
||||||
|
|
||||||
# merge the strains to prepare to write:
|
# prepare to write:
|
||||||
if args.combine or args.combine_URL:
|
if args.combine or args.combine_url or args.combine_pdf:
|
||||||
model = combo_model
|
model = combo_model
|
||||||
else:
|
else:
|
||||||
model = text_model
|
model = text_model
|
||||||
|
@ -244,7 +209,7 @@ def main():
|
||||||
else:
|
else:
|
||||||
write = writesentence
|
write = writesentence
|
||||||
|
|
||||||
# optional headers in file:
|
# print optional headers in file:
|
||||||
with open(args.outfile, "a") as outp:
|
with open(args.outfile, "a") as outp:
|
||||||
# optional print timestamp header:
|
# optional print timestamp header:
|
||||||
if args.timestamp:
|
if args.timestamp:
|
||||||
|
@ -254,9 +219,11 @@ def main():
|
||||||
outp.write("in: " + vars(args)["infile"] + " | ")
|
outp.write("in: " + vars(args)["infile"] + " | ")
|
||||||
if args.combine:
|
if args.combine:
|
||||||
outp.write("comb: " + vars(args)["combine"] + " | ")
|
outp.write("comb: " + vars(args)["combine"] + " | ")
|
||||||
if args.combine_URL:
|
if args.combine_url:
|
||||||
outp.write("comb: " + vars(args)["combine_URL"] + " | ")
|
outp.write("comb: " + vars(args)["combine_url"] + " | ")
|
||||||
if args.combine or args.combine_URL:
|
if args.combine_pdf:
|
||||||
|
outp.write("comb: " + vars(args)["combine_pdf"] + " | ")
|
||||||
|
if args.combine or args.combine_url or args.combine_pdf:
|
||||||
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
|
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
|
||||||
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
|
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
|
||||||
outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
|
outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
|
||||||
|
@ -265,7 +232,7 @@ def main():
|
||||||
# write it!
|
# write it!
|
||||||
write(model, args.sentences, args.outfile, args.overlap, args.length)
|
write(model, args.sentences, args.outfile, args.overlap, args.length)
|
||||||
|
|
||||||
# wrapping up:
|
# wrap up:
|
||||||
print("\n: :\n")
|
print("\n: :\n")
|
||||||
for key, value in vars(args).items():
|
for key, value in vars(args).items():
|
||||||
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
|
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
|
||||||
|
|
6
setup.py
6
setup.py
|
@ -9,8 +9,8 @@ with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="mkv-this",
|
name="mkv-this",
|
||||||
version="0.2.2",
|
version="0.2.3",
|
||||||
description="cli wrapper for markovify: take a text file or URL, markovify, save the results.",
|
description="cli wrapper for markovify: take a text file, directory, pdf or url, markovify, save the results.",
|
||||||
long_description=long_description,
|
long_description=long_description,
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
url="https://git.disroot.org/mousebot/mkv-this",
|
url="https://git.disroot.org/mousebot/mkv-this",
|
||||||
|
@ -24,6 +24,6 @@ setup(
|
||||||
"mkv-this-dir = mkv_this.mkv_this_dir:main",
|
"mkv-this-dir = mkv_this.mkv_this_dir:main",
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
install_requires=["markovify", "argparse", "html2text", "requests",],
|
install_requires=["markovify", "argparse", "html2text", "requests", "pdfminer",],
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue