add combine_pdf, merged all the if branches into one

2020-04-30 13:46:32 -03:00 · 2020-04-30 13:46:32 -03:00 · 2dcbe7149c
parent 1ed0760f7e
commit 2dcbe7149c
4 changed files with 74 additions and 98 deletions
--- a/README.md
+++ b/README.md
@ -2,7 +2,7 @@

 `mkv-this` makes some of the features of the excellent [markovify](https://github.com/jsvine/markovify) module available as a command line tool. i started on it because i wanted to process my own offline files the same way [fedibooks](https://fedibooks.com) processes mastodon toots. then i published it to share with friends. i'm a novice coder, so if you are a programmer and feel like picking it up and improving on it, then by all means!

-the rest of these notes are for end users rather than programmers.
+the rest of these notes are for end users.

 ### mkv-this

@ -53,9 +53,13 @@ if for some reason you want to concatenate some files yourself, you can easily d
 * run `cat * > outputfile.txt`
 * run mkv-this on the file: `mkv-this outputfile.txt`

+### pdfs
+
+since 0.2.3, `mkv-this` can take pdfs as input. but to do this you first need to download `pdfminer.six` with `pip`. because of its size, `pdfminer` is not installed by default with `mkv-this`. converting pdfs like this is not fast, and `mkv-this` must convert the pdf each time. so if you envisage using a large pdf many times, you would be better off converting it to plain text yourself.
+
 ### file types

-you need to input plain text files. currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.
+for directories of text files, the currently accepted file extensions are `.txt`, `.org` and `.md`. it is trivial to add others, so if you want one included just ask.

 if you don't have text files, but odt files, use a tool like `odt2txt` or `unoconv` to convert them to text en masse. both are available in the repos.

@ -80,6 +84,5 @@ i know nothing about macs so if you ask me for help i'll just send you random co
 ### todo

 * hook it up to a web-scraper.
-* hook it up to pdfs.
 * option to also append input model to a saved JSON file. (i.e. `text_model.to_json()`, `markovify.Text.from_json()`). that way you could build up a bank over time.
 * learn how to programme.
--- a/mkv_this/functions.py
+++ b/mkv_this/functions.py
@ -15,7 +15,7 @@ from io import StringIO
 fnf = ": error: file not found. please provide a path to a really-existing file!"


-def URL(insert):
+def url(insert):
    """ fetch a webpage, return it as html """
    try:
        req = requests.get(insert)
@ -23,10 +23,10 @@ def URL(insert):
        req.encoding = req.apparent_encoding
        # use chardet to catch encoding issue with ISO-8859-1/Latin-1.
    except Exception as exc:
-        print(f": There was a problem: {exc}.\n: Please enter a valid URL")
+        print(f": there was trouble: {exc}.\n: please enter a valid url.")
        sys.exit()
    else:
-        print(": fetched HTML from URL.")
+        print(": fetched html from url.")
        return req.text


@ -42,21 +42,27 @@ def convert_html(html):
    h2t.escape_all = False  # remove all noise if needed
    s = h2t.handle(html)
    s = re.sub("[#*]", "", s)  # remove hashes and stars from the 'markdown'
-    print(": URL converted to plain text")
+    print(": html converted to plain text")
    return s


 def read(infile):
    """ read a file so its ready for markov """
    try:
-        with open(infile, encoding="utf-8") as f:
-            return f.read()
+        if infile.lower().endswith(".pdf"):
+            print(
+                "looks like you entered a pdf file. you need to use the '-P' flag to convert it."
+            )
+            sys.exit()
+        else:
+            with open(infile, encoding="utf-8") as f:
+                return f.read()
    except UnicodeDecodeError:
        with open(infile, encoding="latin-1") as f:
            return f.read()
    except IsADirectoryError as exc:
        print(
-            f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
+            f": there was trouble: {exc}.\n: looks like you entered a directory. Use '-d' for that."
        )
        sys.exit()
    except FileNotFoundError:
@ -138,8 +144,8 @@ def dir_cat(matchlist, bulkfile):


 # extract full text from a pdf:
-def convert_pdf_to_txt(path):
-    print("converting pdf file...")
+def convert_pdf(path):
+    print(": converting pdf file...")
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
@ -170,7 +176,7 @@ def convert_pdf_to_txt(path):
        retstr.close()

    except Exception as exc:
-        print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
+        print(f": there was trouble: {exc}.\n: please enter a valid pdf")
        sys.exit()
    else:
        print(": pdf converted.")
--- a/mkv_this/mkv_this.py
+++ b/mkv_this/mkv_this.py
@ -1,7 +1,7 @@
 #! /usr/bin/env python3

 """
-    mkv-this: input a text file, directory, and/or url, output markovified text.
+    mkv-this: input a text file, directory, url and/or pdf, output markovified text.

    Copyright (C) 2020 martianhiatus@riseup.net.

@ -26,7 +26,7 @@ import sys
 import datetime
 import argparse
 from .functions import (
-    URL,
+    url,
    convert_html,
    dir_list,
    dir_cat,
@ -55,7 +55,7 @@ def parse_the_args():
    )
    # optional args:
    parser.add_argument(
-        "-u", "--URL", help="infile is a URL.", action="store_true",
+        "-u", "--url", help="infile is a URL.", action="store_true",
    )
    parser.add_argument(
        "-d",
@ -66,7 +66,7 @@ def parse_the_args():
    parser.add_argument(
        "-P",
        "--pdf",
-        help="infile is a pdf. NB: for this to work you need to install pdfminer.",
+        help="infile is a pdf. NB: for this to work you need to install pdfminer with pip.",
        action="store_true",
    )
    parser.add_argument(
@ -102,7 +102,12 @@ def parse_the_args():
        help="provide an another text file to be combined with the first item.",
    )
    parser.add_argument(
-        "-C", "--combine-URL", help="provide a URL to be combined with the first item."
+        "-C", "--combine-url", help="provide a URL to be combined with the first item."
+    )
+    parser.add_argument(
+        "-K",
+        "--combine-pdf",
+        help="provide a pdf to be combined with the first item. NB: for this to work you need to install pdfminer with pip.",
    )
    parser.add_argument(
        "-w",
@ -146,55 +151,36 @@ args = parse_the_args()


 def main():
-    # if a -c/-C, combine it w infile/URL:
-    if args.combine or args.combine_URL:
-        if args.combine:
-            # get raw text as a string for both:
-            # infile is URL:
-            if args.URL:
-                html = URL(args.infile)
-                text = convert_html(html)
-            # infile is dir:
-            elif args.directory:
-                matchlist = dir_list(args.infile)
-                # place batchfile.txt in user-given directory:
-                batchfile = args.infile + os.path.sep + "batchfile.txt"
-                dir_cat(matchlist, batchfile)
-                text = read(batchfile)
-                os.unlink(batchfile)
-            # infile is pdf:
-            elif args.pdf:
-                text = convert_pdf(args.infile)
-            # or normal:
-            else:
-                text = read(args.infile)
-            # read -c file:
-            ctext = read(args.combine)
+    # get raw text as a string for infile and -c/C if exists:
+    # infile is url:
+    if args.url:
+        html = url(args.infile)
+        text = convert_html(html)
+    # infile is dir:
+    elif args.directory:
+        matchlist = dir_list(args.infile)
+        # place batchfile.txt in user-given directory:
+        batchfile = args.infile + os.path.sep + "batchfile.txt"
+        dir_cat(matchlist, batchfile)
+        text = read(batchfile)
+        os.unlink(batchfile)
+    # infile is pdf:
+    elif args.pdf:
+        text = convert_pdf(args.infile)
+    # or normal:
+    else:
+        text = read(args.infile)

-        # if -C, combine it w infile/URL:
-        elif args.combine_URL:
-            # infile is URL:
-            if args.URL:
-                html = URL(args.infile)
-                text = convert_html(html)
-            # infile is dir:
-            elif args.directory:
-                matchlist = dir_list(args.infile)
-                # place batchfile.txt in args.infile:
-                batchfile = args.infile + os.path.sep + "batchfile.txt"
-                dir_cat(matchlist, batchfile)
-                text = read(batchfile)
-                os.unlink(batchfile)
-            elif args.pdf:
-                text = convert_pdf(args.infile)
-            # or normal:
-            else:
-                text = read(args.infile)
-            # now combine_URL:
-            html = URL(args.combine_URL)
-            ctext = convert_html(html)
+    if args.combine:
+        ctext = read(args.combine)
+    if args.combine_url:
+        html = url(args.combine_url)
+        ctext = convert_html(html)
+    if args.combine_pdf:
+        ctext = convert_pdf(args.combine_pdf)

-        # build the models + a combined model:
+    # build combined model:
+    if args.combine or args.combine_url or args.combine_pdf:
        # with --newline:
        if args.newline:
            text_model = mkbnewline(text, args.state_size, args.well_formed)
@ -203,39 +189,18 @@ def main():
        else:
            text_model = mkbtext(text, args.state_size, args.well_formed)
            ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
-
        combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
-
-    # if no -c/-C, do normal:
+    # build normal model:
    else:
-        # Get raw text as string.
-        # either URL:
-        if args.URL:
-            html = URL(args.infile)
-            text = convert_html(html)
-        elif args.directory:
-            matchlist = dir_list(args.infile)
-            # place batchfile.txt in user-given directory:
-            batchfile = args.infile + os.path.sep + "batchfile.txt"
-            dir_cat(matchlist, batchfile)
-            text = read(batchfile)
-            os.unlink(batchfile)
-        elif args.pdf:
-            text = convert_pdf_to_txt(args.infile)
-        # or local file:
-        else:
-            text = read(args.infile)
-
-        # Build the model:
-        # if --newline:
+        # with --newline:
        if args.newline:
            text_model = mkbnewline(text, args.state_size, args.well_formed)
        # no --newline:
        else:
            text_model = mkbtext(text, args.state_size, args.well_formed)

-    # merge the strains to prepare to write:
-    if args.combine or args.combine_URL:
+    # prepare to write:
+    if args.combine or args.combine_url or args.combine_pdf:
        model = combo_model
    else:
        model = text_model
@ -244,7 +209,7 @@ def main():
    else:
        write = writesentence

-    # optional headers in file:
+    # print optional headers in file:
    with open(args.outfile, "a") as outp:
        # optional print timestamp header:
        if args.timestamp:
@ -254,9 +219,11 @@ def main():
            outp.write("in: " + vars(args)["infile"] + " | ")
            if args.combine:
                outp.write("comb: " + vars(args)["combine"] + " | ")
-            if args.combine_URL:
-                outp.write("comb: " + vars(args)["combine_URL"] + " | ")
-            if args.combine or args.combine_URL:
+            if args.combine_url:
+                outp.write("comb: " + vars(args)["combine_url"] + " | ")
+            if args.combine_pdf:
+                outp.write("comb: " + vars(args)["combine_pdf"] + " | ")
+            if args.combine or args.combine_url or args.combine_pdf:
                outp.write("weight: " + str(vars(args)["weight"]) + " | ")
            outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
            outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
@ -265,7 +232,7 @@ def main():
    # write it!
    write(model, args.sentences, args.outfile, args.overlap, args.length)

-    # wrapping up:
+    # wrap up:
    print("\n:                :\n")
    for key, value in vars(args).items():
        print(": " + key.ljust(15, " ") + ":  " + str(value).ljust(10))
--- a/setup.py
+++ b/setup.py
@ -9,8 +9,8 @@ with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:

 setup(
    name="mkv-this",
-    version="0.2.2",
-    description="cli wrapper for markovify: take a text file or URL, markovify, save the results.",
+    version="0.2.3",
+    description="cli wrapper for markovify: take a text file, directory, pdf or url, markovify, save the results.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://git.disroot.org/mousebot/mkv-this",
@ -24,6 +24,6 @@ setup(
            "mkv-this-dir = mkv_this.mkv_this_dir:main",
        ]
    },
-    install_requires=["markovify", "argparse", "html2text", "requests",],
+    install_requires=["markovify", "argparse", "html2text", "requests", "pdfminer",],
    zip_safe=False,
 )