From cdbb5d4265ab33c4c46ca95ac25e41e7135c6cfd Mon Sep 17 00:00:00 2001 From: mousebot Date: Sun, 26 Apr 2020 11:42:48 -0300 Subject: [PATCH] catch encoding issue with requests for latin-1 urls + vacuuming --- mkv_this/functions.py | 20 +++++----- mkv_this/mkv_this.py | 80 ++++++++++++++++++++++++++++++---------- mkv_this/mkv_this_dir.py | 70 +++++++++++++++++++++++++++-------- setup.py | 4 +- 4 files changed, 129 insertions(+), 45 deletions(-) diff --git a/mkv_this/functions.py b/mkv_this/functions.py index 0ea4be2..f846635 100644 --- a/mkv_this/functions.py +++ b/mkv_this/functions.py @@ -1,10 +1,10 @@ -import os import re import requests import markovify import sys -import argparse import html2text +#import bs4 +# for _scr only fnf = ': error: file not found. please provide a path to a really-existing file!' @@ -14,6 +14,8 @@ def URL(insert): try: req = requests.get(insert) req.raise_for_status() + req.encoding = req.apparent_encoding + # use chardet to catch encoding issue with ISO-8859-1/Latin-1. except Exception as exc: print(f': There was a problem: {exc}.\n: Please enter a valid URL') sys.exit() @@ -25,16 +27,16 @@ def URL(insert): def convert_html(html): """ convert a fetched page to text """ h2t = html2text.HTML2Text() - h2t.ignore_links = True h2t.images_to_alt = True + h2t.ignore_links = True h2t.ignore_emphasis = True h2t.ignore_tables = True - h2t.unicode_snob = True - h2t.decode_errors = 'ignore' - h2t.escape_all = False # remove all noise if needed - print(': URL converted to text') + h2t.unicode_snob = False + h2t.decode_errors = 'replace' + h2t.escape_all = True # remove all noise if needed s = h2t.handle(html) s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown' + print(': URL converted to text') return s @@ -88,7 +90,7 @@ def writesentence(tmodel, args_sen, args_out, args_over, args_len): ### functions for mkv_this_scr.py def get_urls(st_url): - """ fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mth/xx' """ + """ fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mmm/xx' """ try: req = requests.get(st_url) req.raise_for_status() @@ -122,7 +124,7 @@ def scr_URLs(urls): # input a LIST print(': fetched all pages.') return content - + def scr_convert_html(content): # takes a LIST of html pages """ convert all pages obtained by scr_URLs """ h2t = html2text.HTML2Text() diff --git a/mkv_this/mkv_this.py b/mkv_this/mkv_this.py index 9a710cd..71eebe1 100755 --- a/mkv_this/mkv_this.py +++ b/mkv_this/mkv_this.py @@ -23,48 +23,90 @@ import os import sys import argparse +import markovify from .functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence # argparse def parse_the_args(): - parser = argparse.ArgumentParser(prog="mkv-this", description="markovify local text files or URLs and output the results to a local text file.", - epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.") + parser = argparse.ArgumentParser( + prog="mkv-this", + description="markovify local text files or URLs and output the results to a local text file.", + epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial." + ) # positional args: parser.add_argument( - 'infile', help="the text file to process. NB: file cannot be empty.") - parser.add_argument('outfile', nargs='?', default="./mkv-output.txt", - help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.") + 'infile', + help="the text file to process. NB: file cannot be empty." + ) + parser.add_argument( + 'outfile', + nargs='?', + default="./mkv-output.txt", + help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt." + ) # optional args: - parser.add_argument('-s', '--state-size', help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.", type=int, default=2) parser.add_argument( - '-n', '--sentences', help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", type=int, default=5) + '-s', '--state-size', + default=2, + type=int, + help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.", + ) parser.add_argument( - '-l', '--length', help="set maximum number of characters per sentence.", type=int) + '-n', '--sentences', + default=5, + type=int, + help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", + ) parser.add_argument( - '-o', '--overlap', help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5", type=float, default=0.5) + '-l', '--length', + type=int, + help="set maximum number of characters per sentence.", + ) parser.add_argument( - '-c', '--combine', help="provide an another text file to be combined with the first item.") - parser.add_argument('-C', '--combine-URL', - help="provide a URL to be combined with the first item") - parser.add_argument('-w', '--weight', help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.", type=float, default=1) + '-o', '--overlap', + default=0.5, + type=float, + help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5", + ) + parser.add_argument( + '-c', '--combine', + help="provide a second file to combine with first item." + ) + parser.add_argument( + '-C', '--combine-URL', + help="provide a URL to combine with first item" + ) + parser.add_argument( + '-w', '--weight', + default=1, + type=float, + help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.", + ) # switches parser.add_argument( - '-u', '--URL', help="infile is a URL instead.", action='store_true') + '-u', '--URL', + action='store_true', + help="infile is a URL instead.", + ) # store_false = default to True. - parser.add_argument('-f', '--well-formed', - help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true') parser.add_argument( - '--newline', help="sentences in input file end with newlines \ - rather than full stops.", action='store_true') + '-f', '--well-formed', + action='store_true', + help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", + ) + parser.add_argument( + '--newline', + action='store_true', + help="sentences in input file end with newlines rather than full stops.", + ) # store_true = default to False, become True if flagged. return parser.parse_args() - # make args avail: args = parse_the_args() diff --git a/mkv_this/mkv_this_dir.py b/mkv_this/mkv_this_dir.py index f2fd009..8cd061c 100644 --- a/mkv_this/mkv_this_dir.py +++ b/mkv_this/mkv_this_dir.py @@ -22,43 +22,81 @@ import os import sys import argparse +import markovify from .functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence # argparse def parse_the_args(): - parser = argparse.ArgumentParser(prog="mkv-this-dir", description="markovify all text files in a director and output the results to a text file.", - epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.") + parser = argparse.ArgumentParser( + prog="mkv-this-dir", + description="markovify all text files in a director and output the results to a text file.", + epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial." + ) # positional args: parser.add_argument( - 'indir', help="the directory to extract the text of all text files from, with path.") - parser.add_argument('outfile', nargs='?', default="./mkv-dir-output.txt", - help="the file to save to, with path. if the file is used more than once, subsequent literature will be appended to the file after a star. defaults to ./mkv-dir-output.txt.") + 'indir', + help="the directory to extract the text of all text files from, with path." + ) + parser.add_argument( + 'outfile', + nargs='?', + default="./mkv-dir-output.txt", + help="the file to save to, with path. if the file is used more than once, subsequent literature will be appended to the file after a star. defaults to ./mkv-dir-output.txt." + ) # optional args: parser.add_argument( - '-s', '--state-size', help="the number of preceeding words the probability of the next word depends on. defaults to 2, 1 makes it more random, 3 less so.", type=int, default=2) + '-s', '--state-size', + default=2, + type=int, + help="the number of preceeding words the probability of the next word depends on. defaults to 2, 1 makes it more random, 3 less so.", + ) parser.add_argument( - '-n', '--sentences', help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", type=int, default=5) + '-n', '--sentences', + default=5, + type=int, + help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", + ) parser.add_argument( - '-l', '--length', help="set maximum number of characters per sentence.", type=int) - parser.add_argument('-o', '--overlap', help="the amount of overlap allowed between original text and the output, expressed as a radio between 0 and 1. lower values make it more random. defaults to 0.5", type=float, default=0.5) - parser.add_argument('-C', '--combine-URL', - help="provide a URL to be combined with the input dir") - parser.add_argument('-w', '--weight', help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.", type=float, default=1) + '-l', '--length', + type=int, + help="set maximum number of characters per sentence.", + ) + parser.add_argument( + '-o', '--overlap', + default=0.5, + type=float, + help="the amount of overlap allowed between original text and the output, expressed as a radio between 0 and 1. lower values make it more random. defaults to 0.5", + ) + parser.add_argument( + '-C', '--combine-URL', + help="provide a URL to be combined with the input dir" + ) + parser.add_argument( + '-w', '--weight', + default=1, + type=float, + help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.", + ) # switches - parser.add_argument('-f', '--well-formed', - help="enforce 'well_formed', doscard sentences with []{}()""'' from the markov model. use if output is filthy.", action='store_true') + parser.add_argument( + '-f', '--well-formed', + action='store_true', + help="enforce 'well_formed', doscard sentences with []{}()""'' from the markov model. use if output is filthy.", + ) # store_false = default to True. parser.add_argument( - '--newline', help="sentences in input file end with newlines rather than with full stops.", action='store_true') + '--newline', + action='store_true', + help="sentences in input file end with newlines rather than with full stops.", + ) # store_true = default to False, become True if flagged. return parser.parse_args() - # make args avail: args = parse_the_args() diff --git a/setup.py b/setup.py index 76012fc..0918c1f 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: long_description = f.read() setup(name='mkv-this', - version='0.1.42', + version='0.1.43', description='cli wrapper for markovify: take a text file or URL, markovify, save the results.', long_description=long_description, long_description_content_type='text/markdown', @@ -26,6 +26,8 @@ setup(name='mkv-this', 'markovify', 'argparse', 'html2text', + 'requests', +# 'bs4', ], zip_safe=False, )