integrate -dir into main code

2020-04-26 17:04:23 -03:00 · 2020-04-26 17:04:23 -03:00 · 90c9fd7407
parent 7063c8339b
commit 90c9fd7407
2 changed files with 188 additions and 111 deletions
--- a/mkv_this/functions.py
+++ b/mkv_this/functions.py
@ -3,10 +3,10 @@ import re
 import requests
 import markovify
 import sys
-import argparse
 import html2text

-fnf = ': error: file not found. please provide a path to a really-existing file!'
+
+fnf = ": error: file not found. please provide a path to a really-existing file!"


 def URL(insert):
@ -14,11 +14,13 @@ def URL(insert):
    try:
        req = requests.get(insert)
        req.raise_for_status()
+        req.encoding = req.apparent_encoding
+        # try to fix encoding issues
    except Exception as exc:
-        print(f': There was a problem: {exc}.\n: Please enter a valid URL')
+        print(f": There was a problem: {exc}.\n: Please enter a valid URL")
        sys.exit()
    else:
-        print(': fetched URL.')
+        print(": fetched URL.")
        return req.text


@ -30,11 +32,11 @@ def convert_html(html):
    h2t.ignore_emphasis = True
    h2t.ignore_tables = True
    h2t.unicode_snob = True
-    h2t.decode_errors = 'ignore'
+    h2t.decode_errors = "ignore"
    h2t.escape_all = False  # remove all noise if needed
-    print(': URL converted to text')
+    print(": URL converted to text")
    s = h2t.handle(html)
-    s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
+    s = re.sub("[#*]", "", s)  # remove hashes and stars from the 'markdown'
    return s


@ -46,6 +48,11 @@ def read(infile):
    except UnicodeDecodeError:
        with open(infile, encoding="latin-1") as f:
            return f.read()
+    except IsADirectoryError as exc:
+        print(
+            f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
+        )
+        sys.exit()
    except FileNotFoundError:
        print(fnf)
        sys.exit()
@ -53,92 +60,70 @@ def read(infile):

 def mkbtext(texttype, args_ss, args_wf):
    """ build a markov model """
-    return markovify.Text(texttype, state_size=args_ss,
-                          well_formed=args_wf)
+    return markovify.Text(texttype, state_size=args_ss, well_formed=args_wf)


 def mkbnewline(texttype, args_ss, args_wf):
    """ build a markov model, newline """
-    return markovify.NewlineText(texttype, state_size=args_ss,
-                                 well_formed=args_wf)
+    return markovify.NewlineText(texttype, state_size=args_ss, well_formed=args_wf)


 def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """ actually make the damn litter-atchya """
    for i in range(args_sen):
-        output = open(args_out, 'a')  # append
-        output.write(str(tmodel.make_short_sentence(
-            tries=2000, max_overlap_ratio=args_over,
-            max_chars=args_len)) + '\n\n')
-    output.write(str('*\n\n'))
+        output = open(args_out, "a")  # append
+        output.write(
+            str(
+                tmodel.make_short_sentence(
+                    tries=2000, max_overlap_ratio=args_over, max_chars=args_len
+                )
+            )
+            + "\n\n"
+        )
+    output.write(str("*\n\n"))
    output.close()


 def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """ actually make the damn litter-atchya, and short """
    for i in range(args_sen):
-        output = open(args_out, 'a')  # append
-        output.write(str(tmodel.make_sentence(
-            tries=2000, max_overlap_ratio=args_over,
-            max_chars=args_len)) + '\n\n')
-    output.write(str('*\n\n'))
+        output = open(args_out, "a")  # append
+        output.write(
+            str(
+                tmodel.make_sentence(
+                    tries=2000, max_overlap_ratio=args_over, max_chars=args_len
+                )
+            )
+            + "\n\n"
+        )
+    output.write(str("*\n\n"))
    output.close()


-### functions for mkv_this_scr.py
-
-def get_urls(st_url):
-    """ fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mth/xx' """
-    try:
-        req = requests.get(st_url)
-        req.raise_for_status()
-    except Exception as exc:
-        print(f': There was a problem: {exc}.\n: Please enter a valid URL')
-        sys.exit()
+# functions for directory:
+def dir_list(directory):
+    # create a list of files to concatenate:
+    matches = []
+    if os.path.isdir(directory) is True:
+        for root, dirnames, filenames in os.walk(directory):
+            for filename in filenames:
+                if filename.endswith((".txt", ".org", ".md")):
+                    matches.append(os.path.join(root, filename))
+        print(": text files fetched and combined")
    else:
-        print(': fetched initial URL.')
-        soup = bs4.BeautifulSoup(req.text, "lxml")
-        art_elem = soup.select('div[class="fc-item__header"] a[data-link-name="article"]') # pull the element containing article links.
-        urls = []
-        for i in range(len(art_elem)):
-            urls = urls + [art_elem[i].attrs['href']]
-        print(': fetched list of URLs')
-        return urls # returns a LIST
-        
-
-def scr_URLs(urls): # input a LIST
-    """ actually fetch all the URLs obtained by get_urls """
-    try:
-        content = []
-        for i in range(len(urls)):
-            req = requests.get(urls[i])
-            req.raise_for_status()
-            content = content + [req.text] # SUPER slow.
-            print(': fetched page ' + urls[i])
-    except Exception as exc:
-        print(f': There was a problem: {exc}.\n: There was trouble in your list of URLs')
+        print(": error: please enter a valid directory")
        sys.exit()
-    else:
-        print(': fetched all pages.')
-        return content
+    return matches  # returns a LIST of filenames


-def scr_convert_html(content): # takes a LIST of html pages
-    """ convert all pages obtained by scr_URLs """
-    h2t = html2text.HTML2Text()
-    h2t.ignore_links = True
-    h2t.images_to_alt = True
-    h2t.ignore_emphasis = True
-    h2t.ignore_tables = True
-    h2t.unicode_snob = True
-    h2t.decode_errors = 'ignore'
-    h2t.escape_all = False # remove all noise if needed
-    s = []
-    for i in range(len(content)):
-        s = s + [h2t.handle(content[i])] # convert
-    t = []
-    for i in range(len(s)):
-        t = t + [re.sub('[#*]', '', s[i])] # remove hash/star from the 'markdown'
-    u = ' '.join(t) # convert list to string
-    print(': Pages converted to text')
-    return u
+# feed this one the matches list:
+def dir_cat(matchlist, batchfile):
+    # concatenate into batchfile.txt:
+    with open(batchfile, "w") as outfile:
+        for fname in matchlist:
+            try:
+                with open(fname, encoding="utf-8") as infile:
+                    outfile.write(infile.read())
+            except UnicodeDecodeError:
+                with open(fname, encoding="latin-1") as infile:
+                    outfile.write(infile.read())
--- a/mkv_this/mkv_this.py
+++ b/mkv_this/mkv_this.py
@ -20,47 +20,106 @@
 """


-import re
-import requests
 import markovify
-import html2text
 import os
 import sys
 import argparse
-from functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence
+from .functions import (
+    URL,
+    convert_html,
+    dir_list,
+    dir_cat,
+    read,
+    mkbtext,
+    mkbnewline,
+    writesentence,
+    writeshortsentence,
+)

 # argparse
 def parse_the_args():
-    parser = argparse.ArgumentParser(prog="mkv-this", description="markovify local text files or URLs and output the results to a local text file.",
-                                     epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.")
+    parser = argparse.ArgumentParser(
+        prog="mkv-this",
+        description="markovify local text files or URLs and output the results to a local text file.",
+        epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.",
+    )

    # positional args:
    parser.add_argument(
-        'infile', help="the text file to process. NB: file cannot be empty.")
-    parser.add_argument('outfile', nargs='?', default="./mkv-output.txt",
-                        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.")
+        "infile", help="the text file to process. NB: file cannot be empty."
+    )
+    parser.add_argument(
+        "outfile",
+        nargs="?",
+        default="./mkv-output.txt",
+        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.",
+    )

    # optional args:
-    parser.add_argument('-s', '--state-size', help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.", type=int, default=2)
    parser.add_argument(
-        '-n', '--sentences', help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", type=int, default=5)
+        "-s",
+        "--state-size",
+        help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
+        type=int,
+        default=2,
+    )
    parser.add_argument(
-        '-l', '--length', help="set maximum number of characters per sentence.", type=int)
+        "-n",
+        "--sentences",
+        help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
+        type=int,
+        default=5,
+    )
    parser.add_argument(
-        '-o', '--overlap', help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5", type=float, default=0.5)
+        "-l",
+        "--length",
+        help="set maximum number of characters per sentence.",
+        type=int,
+    )
    parser.add_argument(
-        '-c', '--combine', help="provide an another text file to be combined with the first item.")
-    parser.add_argument('-C', '--combine-URL',
-                        help="provide a URL to be combined with the first item")
-    parser.add_argument('-w', '--weight', help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.", type=float, default=1)
+        "-o",
+        "--overlap",
+        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
+        type=float,
+        default=0.5,
+    )
+    parser.add_argument(
+        "-c",
+        "--combine",
+        help="provide an another text file to be combined with the first item.",
+    )
+    parser.add_argument(
+        "-C", "--combine-URL", help="provide a URL to be combined with the first item"
+    )
+    parser.add_argument(
+        "-w",
+        "--weight",
+        help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
+        type=float,
+        default=1,
+    )

    # switches
    parser.add_argument(
-        '-u', '--URL', help="infile is a URL instead.", action='store_true')
-    parser.add_argument('-f', '--well-formed', help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true') # store_false = default to True.
+        "-u", "--URL", help="infile is a URL instead.", action="store_true"
+    )
    parser.add_argument(
-        '--newline', help="sentences in input file end with newlines \
-        rather than full stops.", action='store_true')
+        "-d", "--directory", help="infile is a directory instead.", action="store_true"
+    )
+    parser.add_argument(
+        "-f",
+        "--well-formed",
+        help="enforce 'well_formed': discard sentences containing []{}()"
+        "'' from the markov model. use if output is filthy.",
+        action="store_true",
+    )
+    # store_true = default to False.
+    parser.add_argument(
+        "--newline",
+        help="sentences in input file end with newlines \
+        rather than full stops.",
+        action="store_true",
+    )
    # store_true = default to False, become True if flagged.

    return parser.parse_args()
@ -79,6 +138,14 @@ def main():
            if args.URL:
                html = URL(args.infile)
                text = convert_html(html)
+            # infile is dir:
+            elif args.directory:
+                matchlist = dir_list(args.infile)
+                # place batchfile.txt in user-given directory:
+                batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
+                dir_cat(matchlist, batchfile)
+                text = read(batchfile)
+                os.unlink(batchfile)
            # or normal:
            else:
                text = read(args.infile)
@ -91,6 +158,13 @@ def main():
            if args.URL:
                html = URL(args.infile)
                text = convert_html(html)
+            # infile is dir:
+            elif args.directory:
+                matchlist = dir_list(args.infile)
+                # place batchfile.txt in user-given directory:
+                batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
+                dir_cat(matchlist, batchfile)
+                text = read(batchfile)
            # or normal:
            else:
                text = read(args.infile)
@ -108,14 +182,17 @@ def main():
            text_model = mkbtext(text, args.state_size, args.well_formed)
            ctext_model = mkbtext(ctext, args.state_size, args.well_formed)

-        combo_model = markovify.combine(
-            [text_model, ctext_model], [1, args.weight])
+        combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])

        # write it combo!
        if args.length:
-            writeshortsentence(combo_model, args.sentences, args.outfile, args.overlap, args.length)
+            writeshortsentence(
+                combo_model, args.sentences, args.outfile, args.overlap, args.length
+            )
        else:
-            writesentence(combo_model, args.sentences, args.outfile, args.overlap, args.length)
+            writesentence(
+                combo_model, args.sentences, args.outfile, args.overlap, args.length
+            )

    # if no -c/-C, do normal:
    else:
@ -124,6 +201,12 @@ def main():
        if args.URL:
            html = URL(args.infile)
            text = convert_html(html)
+        elif args.directory:
+            matchlist = dir_list(args.infile)
+            # place batchfile.txt in user-given directory:
+            batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
+            dir_cat(matchlist, batchfile)
+            text = read(batchfile)
        # or local:
        else:
            text = read(args.infile)
@ -138,21 +221,30 @@ def main():

        # write it!
        if args.length:
-            writeshortsentence(text_model, args.sentences, args.outfile, args.overlap, args.length)
+            writeshortsentence(
+                text_model, args.sentences, args.outfile, args.overlap, args.length
+            )
        else:
-            writesentence(text_model, args.sentences, args.outfile, args.overlap, args.length)
+            writesentence(
+                text_model, args.sentences, args.outfile, args.overlap, args.length
+            )

-    print('\n:                :\n')
+    print("\n:                :\n")
    for key, value in vars(args).items():
-        print(': ' + key.ljust(15, ' ') + ':  ' + str(value).ljust(10))
+        print(": " + key.ljust(15, " ") + ":  " + str(value).ljust(10))
    if os.path.isfile(args.outfile):
-        print("\n:  literary genius has been written to the file "
-              + args.outfile + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'")
+        print(
+            "\n:  literary genius has been written to the file "
+            + args.outfile
+            + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'"
+        )
    else:
-        print(': mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!')
+        print(
+            ": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!"
+        )

    sys.exit()


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()