catch encoding issue with requests for latin-1 urls + vacuuming

This commit is contained in:
mousebot 2020-04-26 11:42:48 -03:00
parent ac63cb6c7e
commit cdbb5d4265
4 changed files with 129 additions and 45 deletions

View File

@ -1,10 +1,10 @@
import os
import re
import requests
import markovify
import sys
import argparse
import html2text
#import bs4
# for _scr only
fnf = ': error: file not found. please provide a path to a really-existing file!'
@ -14,6 +14,8 @@ def URL(insert):
try:
req = requests.get(insert)
req.raise_for_status()
req.encoding = req.apparent_encoding
# use chardet to catch encoding issue with ISO-8859-1/Latin-1.
except Exception as exc:
print(f': There was a problem: {exc}.\n: Please enter a valid URL')
sys.exit()
@ -25,16 +27,16 @@ def URL(insert):
def convert_html(html):
""" convert a fetched page to text """
h2t = html2text.HTML2Text()
h2t.ignore_links = True
h2t.images_to_alt = True
h2t.ignore_links = True
h2t.ignore_emphasis = True
h2t.ignore_tables = True
h2t.unicode_snob = True
h2t.decode_errors = 'ignore'
h2t.escape_all = False # remove all noise if needed
print(': URL converted to text')
h2t.unicode_snob = False
h2t.decode_errors = 'replace'
h2t.escape_all = True # remove all noise if needed
s = h2t.handle(html)
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
print(': URL converted to text')
return s
@ -88,7 +90,7 @@ def writesentence(tmodel, args_sen, args_out, args_over, args_len):
### functions for mkv_this_scr.py
def get_urls(st_url):
""" fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mth/xx' """
""" fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mmm/xx' """
try:
req = requests.get(st_url)
req.raise_for_status()
@ -122,7 +124,7 @@ def scr_URLs(urls): # input a LIST
print(': fetched all pages.')
return content
def scr_convert_html(content): # takes a LIST of html pages
""" convert all pages obtained by scr_URLs """
h2t = html2text.HTML2Text()

View File

@ -23,48 +23,90 @@
import os
import sys
import argparse
import markovify
from .functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence
# argparse
def parse_the_args():
parser = argparse.ArgumentParser(prog="mkv-this", description="markovify local text files or URLs and output the results to a local text file.",
epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.")
parser = argparse.ArgumentParser(
prog="mkv-this",
description="markovify local text files or URLs and output the results to a local text file.",
epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial."
)
# positional args:
parser.add_argument(
'infile', help="the text file to process. NB: file cannot be empty.")
parser.add_argument('outfile', nargs='?', default="./mkv-output.txt",
help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.")
'infile',
help="the text file to process. NB: file cannot be empty."
)
parser.add_argument(
'outfile',
nargs='?',
default="./mkv-output.txt",
help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt."
)
# optional args:
parser.add_argument('-s', '--state-size', help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.", type=int, default=2)
parser.add_argument(
'-n', '--sentences', help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", type=int, default=5)
'-s', '--state-size',
default=2,
type=int,
help="the number of preceding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
)
parser.add_argument(
'-l', '--length', help="set maximum number of characters per sentence.", type=int)
'-n', '--sentences',
default=5,
type=int,
help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
)
parser.add_argument(
'-o', '--overlap', help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5", type=float, default=0.5)
'-l', '--length',
type=int,
help="set maximum number of characters per sentence.",
)
parser.add_argument(
'-c', '--combine', help="provide an another text file to be combined with the first item.")
parser.add_argument('-C', '--combine-URL',
help="provide a URL to be combined with the first item")
parser.add_argument('-w', '--weight', help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.", type=float, default=1)
'-o', '--overlap',
default=0.5,
type=float,
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
)
parser.add_argument(
'-c', '--combine',
help="provide a second file to combine with first item."
)
parser.add_argument(
'-C', '--combine-URL',
help="provide a URL to combine with first item"
)
parser.add_argument(
'-w', '--weight',
default=1,
type=float,
help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
)
# switches
parser.add_argument(
'-u', '--URL', help="infile is a URL instead.", action='store_true')
'-u', '--URL',
action='store_true',
help="infile is a URL instead.",
)
# store_false = default to True.
parser.add_argument('-f', '--well-formed',
help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true')
parser.add_argument(
'--newline', help="sentences in input file end with newlines \
rather than full stops.", action='store_true')
'-f', '--well-formed',
action='store_true',
help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.",
)
parser.add_argument(
'--newline',
action='store_true',
help="sentences in input file end with newlines rather than full stops.",
)
# store_true = default to False, become True if flagged.
return parser.parse_args()
# make args avail:
args = parse_the_args()

View File

@ -22,43 +22,81 @@
import os
import sys
import argparse
import markovify
from .functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence
# argparse
def parse_the_args():
parser = argparse.ArgumentParser(prog="mkv-this-dir", description="markovify all text files in a director and output the results to a text file.",
epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.")
parser = argparse.ArgumentParser(
prog="mkv-this-dir",
description="markovify all text files in a directory and output the results to a text file.",
epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial."
)
# positional args:
parser.add_argument(
'indir', help="the directory to extract the text of all text files from, with path.")
parser.add_argument('outfile', nargs='?', default="./mkv-dir-output.txt",
help="the file to save to, with path. if the file is used more than once, subsequent literature will be appended to the file after a star. defaults to ./mkv-dir-output.txt.")
'indir',
help="the directory to extract the text of all text files from, with path."
)
parser.add_argument(
'outfile',
nargs='?',
default="./mkv-dir-output.txt",
help="the file to save to, with path. if the file is used more than once, subsequent literature will be appended to the file after a star. defaults to ./mkv-dir-output.txt."
)
# optional args:
parser.add_argument(
'-s', '--state-size', help="the number of preceeding words the probability of the next word depends on. defaults to 2, 1 makes it more random, 3 less so.", type=int, default=2)
'-s', '--state-size',
default=2,
type=int,
help="the number of preceding words the probability of the next word depends on. defaults to 2, 1 makes it more random, 3 less so.",
)
parser.add_argument(
'-n', '--sentences', help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", type=int, default=5)
'-n', '--sentences',
default=5,
type=int,
help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
)
parser.add_argument(
'-l', '--length', help="set maximum number of characters per sentence.", type=int)
parser.add_argument('-o', '--overlap', help="the amount of overlap allowed between original text and the output, expressed as a radio between 0 and 1. lower values make it more random. defaults to 0.5", type=float, default=0.5)
parser.add_argument('-C', '--combine-URL',
help="provide a URL to be combined with the input dir")
parser.add_argument('-w', '--weight', help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.", type=float, default=1)
'-l', '--length',
type=int,
help="set maximum number of characters per sentence.",
)
parser.add_argument(
'-o', '--overlap',
default=0.5,
type=float,
help="the amount of overlap allowed between original text and the output, expressed as a ratio between 0 and 1. lower values make it more random. defaults to 0.5",
)
parser.add_argument(
'-C', '--combine-URL',
help="provide a URL to be combined with the input dir"
)
parser.add_argument(
'-w', '--weight',
default=1,
type=float,
help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.",
)
# switches
parser.add_argument('-f', '--well-formed',
help="enforce 'well_formed', doscard sentences with []{}()""'' from the markov model. use if output is filthy.", action='store_true')
parser.add_argument(
'-f', '--well-formed',
action='store_true',
help="enforce 'well_formed', discard sentences with []{}()""'' from the markov model. use if output is filthy.",
)
# store_false = default to True.
parser.add_argument(
'--newline', help="sentences in input file end with newlines rather than with full stops.", action='store_true')
'--newline',
action='store_true',
help="sentences in input file end with newlines rather than with full stops.",
)
# store_true = default to False, become True if flagged.
return parser.parse_args()
# make args avail:
args = parse_the_args()

View File

@ -7,7 +7,7 @@ with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
setup(name='mkv-this',
version='0.1.42',
version='0.1.43',
description='cli wrapper for markovify: take a text file or URL, markovify, save the results.',
long_description=long_description,
long_description_content_type='text/markdown',
@ -26,6 +26,8 @@ setup(name='mkv-this',
'markovify',
'argparse',
'html2text',
'requests',
# 'bs4',
],
zip_safe=False,
)