Fix merge conflict.

This commit is contained in:
mousebot 2020-04-26 17:13:31 -03:00
commit b757171209
3 changed files with 197 additions and 166 deletions

View File

@ -1,12 +1,12 @@
import os
import re
import requests
import markovify
import sys
import html2text
#import bs4
# for _scr only
# error message shown when an input path does not exist.
# merge-conflict duplicate removed: both the old (single-quoted) and new
# (double-quoted) assignment lines were present with the identical string.
fnf = ": error: file not found. please provide a path to a really-existing file!"
def URL(insert):
@ -17,10 +17,10 @@ def URL(insert):
req.encoding = req.apparent_encoding
# use chardet to catch encoding issue with ISO-8859-1/Latin-1.
except Exception as exc:
print(f': There was a problem: {exc}.\n: Please enter a valid URL')
print(f": There was a problem: {exc}.\n: Please enter a valid URL")
sys.exit()
else:
print(': fetched URL.')
print(": fetched URL.")
return req.text
@ -32,11 +32,11 @@ def convert_html(html):
h2t.ignore_emphasis = True
h2t.ignore_tables = True
h2t.unicode_snob = False
h2t.decode_errors = 'replace'
h2t.escape_all = True # remove all noise if needed
h2t.decode_errors = "replace"
h2t.escape_all = False # remove all noise if needed
s = h2t.handle(html)
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
print(': URL converted to text')
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
print(": URL converted to text")
return s
@ -48,6 +48,11 @@ def read(infile):
except UnicodeDecodeError:
with open(infile, encoding="latin-1") as f:
return f.read()
except IsADirectoryError as exc:
print(
f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
)
sys.exit()
except FileNotFoundError:
print(fnf)
sys.exit()
@ -55,92 +60,70 @@ def read(infile):
def mkbtext(texttype, args_ss, args_wf):
    """Build a markov model from a body of text.

    texttype: the input corpus (a string of text).
    args_ss: state size — number of preceding words per state.
    args_wf: whether markovify should enforce 'well_formed' filtering.
    """
    # merge residue removed: the old multi-line call and the new one-line
    # call were both present, leaving a dead duplicate return.
    return markovify.Text(texttype, state_size=args_ss, well_formed=args_wf)
def mkbnewline(texttype, args_ss, args_wf):
    """Build a markov model from newline-delimited text.

    Same contract as mkbtext, but uses markovify.NewlineText for input
    whose 'sentences' end at newlines rather than full stops.
    """
    # merge residue removed: old multi-line call duplicated the new one-liner.
    return markovify.NewlineText(texttype, state_size=args_ss, well_formed=args_wf)
def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append args_sen length-capped sentences from tmodel to args_out.

    Each sentence is followed by a blank line and a '*' separator line.
    tmodel: a markovify model exposing make_short_sentence().
    args_sen: number of sentences to write.
    args_out: path of the output file (opened in append mode).
    args_over: max_overlap_ratio passed to markovify.
    args_len: max_chars passed to markovify.

    Fixes merge-conflict residue: the old and new loop bodies were BOTH
    present, so every sentence was generated and written twice per
    iteration. Also uses `with` so the file handle is always closed.
    (Docstring corrected: this is the 'short' variant; the old docstrings
    for writeshortsentence/writesentence were swapped.)
    """
    for _ in range(args_sen):
        with open(args_out, "a") as output:  # append, close via with
            sentence = tmodel.make_short_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() preserves the original behavior of writing "None"
            # when markovify fails to produce a sentence.
            output.write(str(sentence) + "\n\n")
            output.write("*\n\n")
def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append args_sen full-length sentences from tmodel to args_out.

    Each sentence is followed by a blank line and a '*' separator line.
    tmodel: a markovify model exposing make_sentence().
    args_sen: number of sentences to write.
    args_out: path of the output file (opened in append mode).
    args_over: max_overlap_ratio passed to markovify.
    args_len: forwarded as max_chars — NOTE(review): make_sentence reads
    kwargs via .get(), so this appears to be a harmless no-op; confirm
    against the markovify version in use.

    Fixes merge-conflict residue: old and new loop bodies were both
    present, writing every sentence twice per iteration. Uses `with` so
    the handle is always closed. (Docstring corrected: the old one said
    'and short', which described writeshortsentence.)
    """
    for _ in range(args_sen):
        with open(args_out, "a") as output:  # append, close via with
            sentence = tmodel.make_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() preserves the original behavior of writing "None"
            # when markovify fails to produce a sentence.
            output.write(str(sentence) + "\n\n")
            output.write("*\n\n")
### functions for mkv_this_scr.py
# NOTE(review): this entire region is merge-conflict residue as rendered by
# the diff view — lines from the OLD scraper functions (get_urls, scr_URLs,
# which use requests/bs4) are interleaved line-by-line with lines from the
# NEW directory helpers (dir_list). It is not valid Python as it stands and
# must be disentangled against git history — TODO confirm which version of
# each function the merge was meant to keep.
def get_urls(st_url):
""" fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mmm/xx' """
try:
req = requests.get(st_url)
req.raise_for_status()
except Exception as exc:
print(f': There was a problem: {exc}.\n: Please enter a valid URL')
sys.exit()
# functions for directory:
# NOTE(review): dir_list below is NEW code; its body is split — the success
# branch is here, its else-branch appears further down inside scr_URLs.
def dir_list(directory):
# create a list of files to concatenate:
matches = []
if os.path.isdir(directory) is True:
for root, dirnames, filenames in os.walk(directory):
for filename in filenames:
if filename.endswith((".txt", ".org", ".md")):
matches.append(os.path.join(root, filename))
print(": text files fetched and combined")
else:
# NOTE(review): this else-branch belongs to the OLD get_urls (it uses the
# `req` and bs4 soup from get_urls), not to dir_list above.
print(': fetched initial URL.')
soup = bs4.BeautifulSoup(req.text, "lxml")
art_elem = soup.select('div[class="fc-item__header"] a[data-link-name="article"]') # pull the element containing article links.
urls = []
for i in range(len(art_elem)):
urls = urls + [art_elem[i].attrs['href']]
print(': fetched list of URLs')
return urls # returns a LIST
def scr_URLs(urls): # input a LIST
""" actually fetch all the URLs obtained by get_urls """
try:
content = []
for i in range(len(urls)):
req = requests.get(urls[i])
req.raise_for_status()
content = content + [req.text] # SUPER slow.
print(': fetched page ' + urls[i])
except Exception as exc:
print(f': There was a problem: {exc}.\n: There was trouble in your list of URLs')
# NOTE(review): the next two lines are dir_list's else-branch (NEW code)
# spliced into scr_URLs' except-clause by the diff rendering.
print(": error: please enter a valid directory")
sys.exit()
else:
print(': fetched all pages.')
return content
# NOTE(review): this return is dir_list's (NEW code), not scr_URLs'.
return matches # returns a LIST of filenames
def scr_convert_html(content):  # takes a LIST of html pages
    """Convert a list of fetched HTML pages to one plain-text string.

    content: list of HTML strings (as fetched by scr_URLs).
    Returns the pages converted via html2text, stripped of markdown
    hash/star noise, joined with single spaces.
    """
    h2t = html2text.HTML2Text()
    h2t.ignore_links = True
    h2t.images_to_alt = True
    h2t.ignore_emphasis = True
    h2t.ignore_tables = True
    h2t.unicode_snob = True
    h2t.decode_errors = 'ignore'
    h2t.escape_all = False  # remove all noise if needed
    # comprehensions replace the quadratic `s = s + [...]` accumulation:
    converted = [h2t.handle(page) for page in content]
    # remove hash/star from the 'markdown' output:
    cleaned = [re.sub('[#*]', '', text) for text in converted]
    u = ' '.join(cleaned)  # convert list to string
    print(': Pages converted to text')
    return u
# feed this one the matches list:
def dir_cat(matchlist, batchfile):
    """Concatenate every file named in matchlist into batchfile.

    matchlist: list of file paths (as produced by dir_list).
    batchfile: destination path, overwritten on each call.
    Files that are not valid UTF-8 are re-read as latin-1, which accepts
    any byte sequence.
    """
    with open(batchfile, "w") as outfile:
        for fname in matchlist:
            try:
                with open(fname, encoding="utf-8") as infile:
                    text = infile.read()
            except UnicodeDecodeError:
                # fall back for non-UTF-8 input files
                with open(fname, encoding="latin-1") as infile:
                    text = infile.read()
            outfile.write(text)

View File

@ -20,93 +20,116 @@
"""
# merge residue removed: `import markovify` appeared twice and both the old
# one-line and new multi-line `from .functions import` were present.
# Grouped per convention: stdlib, third-party, then local.
import argparse
import os
import sys

import markovify

from .functions import (
    URL,
    convert_html,
    dir_list,
    dir_cat,
    read,
    mkbtext,
    mkbnewline,
    writesentence,
    writeshortsentence,
)
# argparse
def parse_the_args():
    """Build the CLI parser and return the parsed arguments.

    Reconstructed from merge-conflict residue: the old and new
    add_argument calls were interleaved line-by-line (e.g. the old '-s'
    fragment was fused into the new '-u' call), which was not valid
    Python. This keeps the NEW argument set: -u/-d switches added,
    black-style formatting, all help strings preserved verbatim.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description="markovify local text files or URLs and output the results to a local text file.",
        epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.",
    )
    # positional args:
    parser.add_argument(
        "infile", help="the text file to process. NB: file cannot be empty."
    )
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.",
    )
    # optional args:
    parser.add_argument(
        "-u",
        "--URL",
        help="infile is a URL instead. all text it contains will be used.",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--directory",
        help="infile is a directory instead. all text files in it will be used.",
        action="store_true",
    )
    parser.add_argument(
        "-s",
        "--state-size",
        help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n",
        "--sentences",
        help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
        type=int,
        default=5,
    )
    parser.add_argument(
        "-l",
        "--length",
        help="set maximum number of characters per sentence.",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--overlap",
        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "-c",
        "--combine",
        help="provide an another text file to be combined with the first item.",
    )
    parser.add_argument(
        "-C", "--combine-URL", help="provide a URL to be combined with the first item"
    )
    parser.add_argument(
        "-w",
        "--weight",
        help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
        type=float,
        default=1,
    )
    # switches (store_true: default False, becomes True when flagged)
    parser.add_argument(
        "-f",
        "--well-formed",
        help="enforce 'well_formed': discard sentences containing []{}()"
        "'' from the markov model. use if output is filthy.",
        action="store_true",
    )
    parser.add_argument(
        "--newline",
        help="sentences in input file end with newlines \
rather than full stops.",
        action="store_true",
    )
    return parser.parse_args()
# make args avail:
# NOTE: parsed once at import time so main() below can read `args` directly.
args = parse_the_args()
@ -120,7 +143,15 @@ def main():
if args.URL:
html = URL(args.infile)
text = convert_html(html)
# or normal:
# infile is dir:
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in user-given directory:
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
# or normal:
else:
text = read(args.infile)
# read -c file:
@ -132,7 +163,14 @@ def main():
if args.URL:
html = URL(args.infile)
text = convert_html(html)
# or normal:
# infile is dir:
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in user-given directory:
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
# or normal:
else:
text = read(args.infile)
# now combine_URL:
@ -149,16 +187,17 @@ def main():
text_model = mkbtext(text, args.state_size, args.well_formed)
ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
combo_model = markovify.combine(
[text_model, ctext_model], [1, args.weight])
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
# write it combo!
if args.length:
writeshortsentence(combo_model, args.sentences,
args.outfile, args.overlap, args.length)
writeshortsentence(
combo_model, args.sentences, args.outfile, args.overlap, args.length
)
else:
writesentence(combo_model, args.sentences,
args.outfile, args.overlap, args.length)
writesentence(
combo_model, args.sentences, args.outfile, args.overlap, args.length
)
# if no -c/-C, do normal:
else:
@ -167,6 +206,12 @@ def main():
if args.URL:
html = URL(args.infile)
text = convert_html(html)
elif args.directory:
matchlist = dir_list(args.infile)
# place batchfile.txt in user-given directory:
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
dir_cat(matchlist, batchfile)
text = read(batchfile)
# or local:
else:
text = read(args.infile)
@ -181,23 +226,30 @@ def main():
# write it!
if args.length:
writeshortsentence(text_model, args.sentences,
args.outfile, args.overlap, args.length)
writeshortsentence(
text_model, args.sentences, args.outfile, args.overlap, args.length
)
else:
writesentence(text_model, args.sentences,
args.outfile, args.overlap, args.length)
writesentence(
text_model, args.sentences, args.outfile, args.overlap, args.length
)
print('\n: :\n')
print("\n: :\n")
for key, value in vars(args).items():
print(': ' + key.ljust(15, ' ') + ': ' + str(value).ljust(10))
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
if os.path.isfile(args.outfile):
print("\n: literary genius has been written to the file "
+ args.outfile + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'")
print(
"\n: literary genius has been written to the file "
+ args.outfile
+ ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'"
)
else:
print(': mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!')
print(
": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!"
)
sys.exit()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -2,32 +2,28 @@ from setuptools import setup, find_packages
# read the contents of your README file as long_description:
from os import path

this_directory = path.abspath(path.dirname(__file__))
# merge residue removed: both the old and new versions of this `with open`
# line were present; the file was only read once but the text was invalid.
with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
    long_description = f.read()
# merge residue removed: the entire setup() call appeared twice (old
# single-quoted style and new black-formatted style) with identical
# metadata; only the new version is kept. Stray trailing comma inside
# install_requires also dropped.
setup(
    name="mkv-this",
    version="0.1.43",
    description="cli wrapper for markovify: take a text file or URL, markovify, save the results.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://git.disroot.org/mousebot/mkv-this",
    author="mousebot",
    author_email="martianhiatus@riseup.net",
    license="AGPLv3",
    packages=find_packages(),
    entry_points={
        "console_scripts": [
            "mkv-this = mkv_this.mkv_this:main",
            "mkv-this-dir = mkv_this.mkv_this_dir:main",
        ]
    },
    # NOTE(review): 'argparse' is stdlib since Python 2.7/3.2 and need not
    # be listed; kept to preserve the published metadata unchanged.
    install_requires=["markovify", "argparse", "html2text", "requests"],
    zip_safe=False,
)