integrate -dir into main code
This commit is contained in:
parent
7063c8339b
commit
90c9fd7407
|
@ -3,10 +3,10 @@ import re
|
|||
import requests
|
||||
import markovify
|
||||
import sys
|
||||
import argparse
|
||||
import html2text
|
||||
|
||||
# Message printed when an input path does not exist (see the
# FileNotFoundError handler in read()).
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
||||
|
||||
|
||||
def URL(insert):
|
||||
|
@ -14,11 +14,13 @@ def URL(insert):
|
|||
try:
|
||||
req = requests.get(insert)
|
||||
req.raise_for_status()
|
||||
req.encoding = req.apparent_encoding
|
||||
# try to fix encoding issues
|
||||
except Exception as exc:
|
||||
print(f': There was a problem: {exc}.\n: Please enter a valid URL')
|
||||
print(f": There was a problem: {exc}.\n: Please enter a valid URL")
|
||||
sys.exit()
|
||||
else:
|
||||
print(': fetched URL.')
|
||||
print(": fetched URL.")
|
||||
return req.text
|
||||
|
||||
|
||||
|
@ -30,11 +32,11 @@ def convert_html(html):
|
|||
h2t.ignore_emphasis = True
|
||||
h2t.ignore_tables = True
|
||||
h2t.unicode_snob = True
|
||||
h2t.decode_errors = 'ignore'
|
||||
h2t.decode_errors = "ignore"
|
||||
h2t.escape_all = False # remove all noise if needed
|
||||
print(': URL converted to text')
|
||||
print(": URL converted to text")
|
||||
s = h2t.handle(html)
|
||||
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
|
||||
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
|
||||
return s
|
||||
|
||||
|
||||
|
@ -46,6 +48,11 @@ def read(infile):
|
|||
except UnicodeDecodeError:
|
||||
with open(infile, encoding="latin-1") as f:
|
||||
return f.read()
|
||||
except IsADirectoryError as exc:
|
||||
print(
|
||||
f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
|
||||
)
|
||||
sys.exit()
|
||||
except FileNotFoundError:
|
||||
print(fnf)
|
||||
sys.exit()
|
||||
|
@ -53,92 +60,70 @@ def read(infile):
|
|||
|
||||
def mkbtext(texttype, args_ss, args_wf):
    """Build a markov model.

    texttype -- the input text, handed straight to ``markovify.Text``.
    args_ss  -- forwarded as ``state_size``.
    args_wf  -- forwarded as ``well_formed``.

    Returns the ``markovify.Text`` instance.
    """
    return markovify.Text(texttype, state_size=args_ss, well_formed=args_wf)
|
||||
|
||||
|
||||
def mkbnewline(texttype, args_ss, args_wf):
    """Build a markov model, newline variant.

    texttype -- the input text, handed straight to ``markovify.NewlineText``.
    args_ss  -- forwarded as ``state_size``.
    args_wf  -- forwarded as ``well_formed``.

    Returns the ``markovify.NewlineText`` instance.
    """
    return markovify.NewlineText(texttype, state_size=args_ss, well_formed=args_wf)
|
||||
|
||||
|
||||
def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append ``args_sen`` short sentences generated by ``tmodel`` to a file.

    tmodel    -- object providing ``make_short_sentence``.
    args_sen  -- number of sentences to write.
    args_out  -- path of the output file; it is opened in append mode.
    args_over -- forwarded as ``max_overlap_ratio``.
    args_len  -- forwarded as ``max_chars``.

    Every sentence is followed by a blank line, a ``*`` line and another
    blank line. Returns None.
    """
    if args_sen < 1:
        # Nothing to write: leave the filesystem untouched, exactly as a
        # loop that never runs would.
        return
    # Open once; the context manager closes the file even when sentence
    # generation raises part-way through.
    with open(args_out, "a") as output:  # append
        for _ in range(args_sen):
            sentence = tmodel.make_short_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() writes whatever the model returned.
            # NOTE(review): presumably the model can return None when no
            # sentence is found, which lands in the file as "None" - confirm
            # that this is wanted.
            output.write(str(sentence) + "\n\n")
            output.write("*\n\n")
|
||||
|
||||
|
||||
def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append ``args_sen`` sentences generated by ``tmodel`` to a file.

    tmodel    -- object providing ``make_sentence``.
    args_sen  -- number of sentences to write.
    args_out  -- path of the output file; it is opened in append mode.
    args_over -- forwarded as ``max_overlap_ratio``.
    args_len  -- forwarded as ``max_chars``.

    Every sentence is followed by a blank line, a ``*`` line and another
    blank line. Returns None.
    """
    if args_sen < 1:
        # Nothing to write: leave the filesystem untouched, exactly as a
        # loop that never runs would.
        return
    # Open once; the context manager closes the file even when sentence
    # generation raises part-way through.
    with open(args_out, "a") as output:  # append
        for _ in range(args_sen):
            # NOTE(review): max_chars is forwarded unchanged from the
            # original call - confirm make_sentence actually honours it.
            sentence = tmodel.make_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() writes whatever the model returned.
            # NOTE(review): presumably the model can return None when no
            # sentence is found, which lands in the file as "None" - confirm
            # that this is wanted.
            output.write(str(sentence) + "\n\n")
            output.write("*\n\n")
|
||||
|
||||
|
||||
### functions for mkv_this_scr.py
|
||||
|
||||
def get_urls(st_url):
    """Fetch a bunch of article URLs from a Guardian world news page.

    st_url -- page for a given date.
              Format: 'https://theguardian.com/cat/YEAR/mth/xx'

    Returns a list holding the ``href`` of every element matched by the
    article-link selector. If the request fails, prints the problem and
    exits via ``sys.exit()``.
    """
    try:
        req = requests.get(st_url)
        req.raise_for_status()
    except Exception as exc:
        print(f': There was a problem: {exc}.\n: Please enter a valid URL')
        sys.exit()
    else:
        print(': fetched initial URL.')
        soup = bs4.BeautifulSoup(req.text, "lxml")
        # pull the elements containing article links.
        art_elem = soup.select('div[class="fc-item__header"] a[data-link-name="article"]')
        urls = [elem.attrs['href'] for elem in art_elem]
        print(': fetched list of URLs')
        return urls  # returns a LIST


def scr_URLs(urls):  # input a LIST
    """Actually fetch all the URLs obtained by get_urls.

    urls -- list of URLs to request, in order.

    Returns a list with the response text of each page. If any request
    fails, prints the problem and exits via ``sys.exit()``.
    """
    content = []
    try:
        for url in urls:
            req = requests.get(url)
            req.raise_for_status()
            content.append(req.text)
            print(': fetched page ' + url)
    except Exception as exc:
        print(f': There was a problem: {exc}.\n: There was trouble in your list of URLs')
        sys.exit()
    else:
        print(': fetched all pages.')
        return content


# functions for directory:
def dir_list(directory):
    """Create a list of the text files found under ``directory``.

    directory -- path to walk recursively with ``os.walk``.

    Returns a list of paths for every file whose name ends in ``.txt``,
    ``.org`` or ``.md``. If ``directory`` is not a directory, prints an
    error and exits via ``sys.exit()``.
    """
    if not os.path.isdir(directory):
        print(": error: please enter a valid directory")
        sys.exit()
    matches = []
    for root, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith((".txt", ".org", ".md")):
                matches.append(os.path.join(root, filename))
    print(": text files fetched and combined")
    return matches  # returns a LIST of filenames
|
||||
|
||||
|
||||
def scr_convert_html(content):  # takes a LIST of html pages
    """Convert all pages obtained by scr_URLs to a single string of text.

    content -- list of html pages.

    Each page is run through ``html2text``, hashes and stars are removed
    from the resulting 'markdown', and the pages are joined with a single
    space. Returns the joined string.
    """
    h2t = html2text.HTML2Text()
    h2t.ignore_links = True
    h2t.images_to_alt = True
    h2t.ignore_emphasis = True
    h2t.ignore_tables = True
    h2t.unicode_snob = True
    h2t.decode_errors = 'ignore'
    h2t.escape_all = False  # remove all noise if needed
    # convert each page, then remove hash/star from the 'markdown'
    cleaned = [re.sub('[#*]', '', h2t.handle(page)) for page in content]
    u = ' '.join(cleaned)  # convert list to string
    print(': Pages converted to text')
    return u
|
||||
# feed this one the matches list:
|
||||
def dir_cat(matchlist, batchfile):
    """Concatenate the files in ``matchlist`` into ``batchfile``.

    matchlist -- iterable of file paths to read, in order.
    batchfile -- path of the file to (over)write with their combined text.

    Each input is decoded as utf-8; if that raises UnicodeDecodeError the
    file is re-read as latin-1. Returns None.
    """
    # Write utf-8 explicitly: the inputs are decoded explicitly, so the
    # output should not depend on the platform's default encoding.
    with open(batchfile, "w", encoding="utf-8") as outfile:
        for fname in matchlist:
            # Only the read sits inside the try, so nothing is written
            # for a file until it has been decoded in full.
            try:
                with open(fname, encoding="utf-8") as infile:
                    text = infile.read()
            except UnicodeDecodeError:
                with open(fname, encoding="latin-1") as infile:
                    text = infile.read()
            outfile.write(text)
|
||||
|
|
|
@ -20,47 +20,106 @@
|
|||
"""
|
||||
|
||||
|
||||
import re
|
||||
import requests
|
||||
import markovify
|
||||
import html2text
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence
|
||||
from .functions import (
|
||||
URL,
|
||||
convert_html,
|
||||
dir_list,
|
||||
dir_cat,
|
||||
read,
|
||||
mkbtext,
|
||||
mkbnewline,
|
||||
writesentence,
|
||||
writeshortsentence,
|
||||
)
|
||||
|
||||
# argparse
|
||||
def parse_the_args():
    """Build the mkv-this command-line parser and parse the arguments.

    Takes no parameters; ``parser.parse_args()`` is called without an
    explicit argument list. Returns the resulting ``argparse.Namespace``
    with the attributes ``infile``, ``outfile``, ``state_size``,
    ``sentences``, ``length``, ``overlap``, ``combine``, ``combine_URL``,
    ``weight``, ``URL``, ``directory``, ``well_formed`` and ``newline``.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description="markovify local text files or URLs and output the results to a local text file.",
        epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.",
    )

    # positional args:
    parser.add_argument(
        "infile", help="the text file to process. NB: file cannot be empty."
    )
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.",
    )

    # optional args:
    parser.add_argument(
        "-s",
        "--state-size",
        help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n",
        "--sentences",
        help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
        type=int,
        default=5,
    )
    parser.add_argument(
        "-l",
        "--length",
        help="set maximum number of characters per sentence.",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--overlap",
        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "-c",
        "--combine",
        help="provide an another text file to be combined with the first item.",
    )
    parser.add_argument(
        "-C", "--combine-URL", help="provide a URL to be combined with the first item"
    )
    parser.add_argument(
        "-w",
        "--weight",
        help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
        type=float,
        default=1,
    )

    # switches
    parser.add_argument(
        "-u", "--URL", help="infile is a URL instead.", action="store_true"
    )
    parser.add_argument(
        "-d", "--directory", help="infile is a directory instead.", action="store_true"
    )
    parser.add_argument(
        "-f",
        "--well-formed",
        # The double quotes are escaped: written as bare "" they end and
        # restart the string literal and vanish from the help text.
        help="enforce 'well_formed': discard sentences containing []{}()\"\"'' from the markov model. use if output is filthy.",
        action="store_true",
    )
    # store_true = default to False.
    parser.add_argument(
        "--newline",
        help="sentences in input file end with newlines rather than full stops.",
        action="store_true",
    )
    # store_true = default to False, become True if flagged.

    return parser.parse_args()
|
||||
|
@ -79,6 +138,14 @@ def main():
|
|||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
# infile is dir:
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
@ -91,6 +158,13 @@ def main():
|
|||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
# infile is dir:
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
@ -108,14 +182,17 @@ def main():
|
|||
text_model = mkbtext(text, args.state_size, args.well_formed)
|
||||
ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
|
||||
|
||||
combo_model = markovify.combine(
|
||||
[text_model, ctext_model], [1, args.weight])
|
||||
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
|
||||
|
||||
# write it combo!
|
||||
if args.length:
|
||||
writeshortsentence(combo_model, args.sentences, args.outfile, args.overlap, args.length)
|
||||
writeshortsentence(
|
||||
combo_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
else:
|
||||
writesentence(combo_model, args.sentences, args.outfile, args.overlap, args.length)
|
||||
writesentence(
|
||||
combo_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
|
||||
# if no -c/-C, do normal:
|
||||
else:
|
||||
|
@ -124,6 +201,12 @@ def main():
|
|||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
# or local:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
@ -138,21 +221,30 @@ def main():
|
|||
|
||||
# write it!
|
||||
if args.length:
|
||||
writeshortsentence(text_model, args.sentences, args.outfile, args.overlap, args.length)
|
||||
writeshortsentence(
|
||||
text_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
else:
|
||||
writesentence(text_model, args.sentences, args.outfile, args.overlap, args.length)
|
||||
writesentence(
|
||||
text_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
|
||||
print('\n: :\n')
|
||||
print("\n: :\n")
|
||||
for key, value in vars(args).items():
|
||||
print(': ' + key.ljust(15, ' ') + ': ' + str(value).ljust(10))
|
||||
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
|
||||
if os.path.isfile(args.outfile):
|
||||
print("\n: literary genius has been written to the file "
|
||||
+ args.outfile + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'")
|
||||
print(
|
||||
"\n: literary genius has been written to the file "
|
||||
+ args.outfile
|
||||
+ ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'"
|
||||
)
|
||||
else:
|
||||
print(': mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!')
|
||||
print(
|
||||
": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!"
|
||||
)
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
|
|
Loading…
Reference in New Issue