From 2dcbe7149c542ecb5ab76a248b3e4e7d4820769a Mon Sep 17 00:00:00 2001 From: mousebot Date: Thu, 30 Apr 2020 13:46:32 -0300 Subject: [PATCH] add combine_pdf, merged all the if branches into one --- README.md | 9 ++- mkv_this/functions.py | 26 +++++---- mkv_this/mkv_this.py | 131 ++++++++++++++++-------------------------- setup.py | 6 +- 4 files changed, 74 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index ece2bed..2e36e9c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ `mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so you are a programmer and felt like picking it up and improving on it, then by all means! -the rest of these notes are for end users rather than programmers. +the rest of these notes are for end users. ### mkv-this @@ -53,9 +53,13 @@ if for some reason you want to concatenate some files yourself, you can easily d * run `cat * > outputfile.txt` * run mkv-this on the file: `mkv-this outputfile.txt` +### pdfs + +since 0.2.3, `mkv-this` can take pdfs as input. but to do this you first need to download `pdfminer.six` with `pip`. because of its size, `pdfminer` is not installed by default with `mkv-this`. converting pdfs like this is not fast, and `mkv-this` must convert the pdf each time. so if you envisage using a large pdf many times, you would be better off converting it to plain text yourself. + ### file types -you need to input plain text files. currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask. +for directories of text files, the currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask. if you don't have text files, but odt files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos. @@ -80,6 +84,5 @@ i know nothing about macs so if you ask me for help i'll just send you random co ### todo * hook it up to a web-scraper. -* hook it up to pdfs. * option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time. * learn how to programme. diff --git a/mkv_this/functions.py b/mkv_this/functions.py index 7116623..c7b64a4 100644 --- a/mkv_this/functions.py +++ b/mkv_this/functions.py @@ -15,7 +15,7 @@ from io import StringIO fnf = ": error: file not found. please provide a path to a really-existing file!" -def URL(insert): +def url(insert): """ fetch a webpage, return it as html """ try: req = requests.get(insert) @@ -23,10 +23,10 @@ def URL(insert): req.encoding = req.apparent_encoding # use chardet to catch encoding issue with ISO-8859-1/Latin-1. except Exception as exc: - print(f": There was a problem: {exc}.\n: Please enter a valid URL") + print(f": there was trouble: {exc}.\n: please enter a valid url.") sys.exit() else: - print(": fetched HTML from URL.") + print(": fetched html from url.") return req.text @@ -42,21 +42,27 @@ def convert_html(html): h2t.escape_all = False # remove all noise if needed s = h2t.handle(html) s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown' - print(": URL converted to plain text") + print(": html converted to plain text") return s def read(infile): """ read a file so its ready for markov """ try: - with open(infile, encoding="utf-8") as f: - return f.read() + if infile.lower().endswith(".pdf"): + print( + "looks like you entered a pdf file. you need to use the '-P' flag to convert it." + ) + sys.exit() + else: + with open(infile, encoding="utf-8") as f: + return f.read() except UnicodeDecodeError: with open(infile, encoding="latin-1") as f: return f.read() except IsADirectoryError as exc: print( - f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that." + f": there was trouble: {exc}.\n: looks like you entered a directory. Use '-d' for that." ) sys.exit() except FileNotFoundError: @@ -138,8 +144,8 @@ def dir_cat(matchlist, bulkfile): # extract full text from a pdf: -def convert_pdf_to_txt(path): - print("converting pdf file...") +def convert_pdf(path): + print(": converting pdf file...") try: rsrcmgr = PDFResourceManager() retstr = StringIO() @@ -170,7 +176,7 @@ def convert_pdf_to_txt(path): retstr.close() except Exception as exc: - print(f": There was a problem: {exc}.\n: Please enter a valid pdf") + print(f": there was trouble: {exc}.\n: please enter a valid pdf") sys.exit() else: print(": pdf converted.") diff --git a/mkv_this/mkv_this.py b/mkv_this/mkv_this.py index 68d27ac..a67a3ae 100755 --- a/mkv_this/mkv_this.py +++ b/mkv_this/mkv_this.py @@ -1,7 +1,7 @@ #! /usr/bin/env python3 """ - mkv-this: input a text file, directory, and/or url, output markovified text. + mkv-this: input a text file, directory, url and/or pdf, output markovified text. Copyright (C) 2020 martianhiatus@riseup.net. @@ -26,7 +26,7 @@ import sys import datetime import argparse from .functions import ( - URL, + url, convert_html, dir_list, dir_cat, @@ -55,7 +55,7 @@ def parse_the_args(): ) # optional args: parser.add_argument( - "-u", "--URL", help="infile is a URL.", action="store_true", + "-u", "--url", help="infile is a URL.", action="store_true", ) parser.add_argument( "-d", @@ -66,7 +66,7 @@ def parse_the_args(): parser.add_argument( "-P", "--pdf", - help="infile is a pdf. NB: for this to work you need to install pdfminer.", + help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.", action="store_true", ) parser.add_argument( @@ -102,7 +102,12 @@ def parse_the_args(): help="provide an another text file to be combined with the first item.", ) parser.add_argument( - "-C", "--combine-URL", help="provide a URL to be combined with the first item." + "-C", "--combine-url", help="provide a URL to be combined with the first item." + ) + parser.add_argument( + "-K", + "--combine-pdf", + help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.", ) parser.add_argument( "-w", @@ -146,55 +151,36 @@ args = parse_the_args() def main(): - # if a -c/-C, combine it w infile/URL: - if args.combine or args.combine_URL: - if args.combine: - # get raw text as a string for both: - # infile is URL: - if args.URL: - html = URL(args.infile) - text = convert_html(html) - # infile is dir: - elif args.directory: - matchlist = dir_list(args.infile) - # place batchfile.txt in user-given directory: - batchfile = args.infile + os.path.sep + "batchfile.txt" - dir_cat(matchlist, batchfile) - text = read(batchfile) - os.unlink(batchfile) - # infile is pdf: - elif args.pdf: - text = convert_pdf(args.infile) - # or normal: - else: - text = read(args.infile) - # read -c file: - ctext = read(args.combine) + # get raw text as a string for infile and -c/C if exists: + # infile is url: + if args.url: + html = url(args.infile) + text = convert_html(html) + # infile is dir: + elif args.directory: + matchlist = dir_list(args.infile) + # place batchfile.txt in user-given directory: + batchfile = args.infile + os.path.sep + "batchfile.txt" + dir_cat(matchlist, batchfile) + text = read(batchfile) + os.unlink(batchfile) + # infile is pdf: + elif args.pdf: + text = convert_pdf(args.infile) + # or normal: + else: + text = read(args.infile) - # if -C, combine it w infile/URL: - elif args.combine_URL: - # infile is URL: - if args.URL: - html = URL(args.infile) - text = convert_html(html) - # infile is dir: - elif args.directory: - matchlist = dir_list(args.infile) - # place batchfile.txt in args.infile: - batchfile = args.infile + os.path.sep + "batchfile.txt" - dir_cat(matchlist, batchfile) - text = read(batchfile) - os.unlink(batchfile) - elif args.pdf: - text = convert_pdf(args.infile) - # or normal: - else: - text = read(args.infile) - # now combine_URL: - html = URL(args.combine_URL) - ctext = convert_html(html) + if args.combine: + ctext = read(args.combine) + if args.combine_url: + html = url(args.combine_url) + ctext = convert_html(html) + if args.combine_pdf: + ctext = convert_pdf(args.combine_pdf) - # build the models + a combined model: + # build combined model: + if args.combine or args.combine_url or args.combine_pdf: # with --newline: if args.newline: text_model = mkbnewline(text, args.state_size, args.well_formed) @@ -203,39 +189,18 @@ def main(): else: text_model = mkbtext(text, args.state_size, args.well_formed) ctext_model = mkbtext(ctext, args.state_size, args.well_formed) - combo_model = markovify.combine([text_model, ctext_model], [1, args.weight]) - - # if no -c/-C, do normal: + # build normal model: else: - # Get raw text as string. - # either URL: - if args.URL: - html = URL(args.infile) - text = convert_html(html) - elif args.directory: - matchlist = dir_list(args.infile) - # place batchfile.txt in user-given directory: - batchfile = args.infile + os.path.sep + "batchfile.txt" - dir_cat(matchlist, batchfile) - text = read(batchfile) - os.unlink(batchfile) - elif args.pdf: - text = convert_pdf_to_txt(args.infile) - # or local file: - else: - text = read(args.infile) - - # Build the model: - # if --newline: + # with --newline: if args.newline: text_model = mkbnewline(text, args.state_size, args.well_formed) # no --newline: else: text_model = mkbtext(text, args.state_size, args.well_formed) - # merge the strains to prepare to write: - if args.combine or args.combine_URL: + # prepare to write: + if args.combine or args.combine_url or args.combine_pdf: model = combo_model else: model = text_model @@ -244,7 +209,7 @@ def main(): else: write = writesentence - # optional headers in file: + # print optional headers in file: with open(args.outfile, "a") as outp: # optional print timestamp header: if args.timestamp: @@ -254,9 +219,11 @@ def main(): outp.write("in: " + vars(args)["infile"] + " | ") if args.combine: outp.write("comb: " + vars(args)["combine"] + " | ") - if args.combine_URL: - outp.write("comb: " + vars(args)["combine_URL"] + " | ") - if args.combine or args.combine_URL: + if args.combine_url: + outp.write("comb: " + vars(args)["combine_url"] + " | ") + if args.combine_pdf: + outp.write("comb: " + vars(args)["combine_pdf"] + " | ") + if args.combine or args.combine_url or args.combine_pdf: outp.write("weight: " + str(vars(args)["weight"]) + " | ") outp.write("overlap: " + str(vars(args)["overlap"]) + " | ") outp.write("state size: " + str(vars(args)["state_size"]) + "\n") @@ -265,7 +232,7 @@ def main(): # write it! write(model, args.sentences, args.outfile, args.overlap, args.length) - # wrapping up: + # wrap up: print("\n: :\n") for key, value in vars(args).items(): print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10)) diff --git a/setup.py b/setup.py index 16ca460..dbf6ac7 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,8 @@ with open(path.join(this_directory, "README.md"), encoding="utf-8") as f: setup( name="mkv-this", - version="0.2.2", - description="cli wrapper for markovify: take a text file or URL, markovify, save the results.", + version="0.2.3", + description="cli wrapper for markovify: take a text file, directory, pdf or url, markovify, save the results.", long_description=long_description, long_description_content_type="text/markdown", url="https://git.disroot.org/mousebot/mkv-this", @@ -24,6 +24,6 @@ setup( "mkv-this-dir = mkv_this.mkv_this_dir:main", ] }, - install_requires=["markovify", "argparse", "html2text", "requests",], + install_requires=["markovify", "argparse", "html2text", "requests", "pdfminer",], zip_safe=False, )