fix merge conflict
This commit is contained in:
commit
b757171209
|
@ -1,12 +1,12 @@
|
|||
import os
|
||||
import re
|
||||
import requests
|
||||
import markovify
|
||||
import sys
|
||||
import html2text
|
||||
#import bs4
|
||||
# for _scr only
|
||||
|
||||
# message printed when a requested input file does not exist
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
||||
|
||||
|
||||
def URL(insert):
|
||||
|
@ -17,10 +17,10 @@ def URL(insert):
|
|||
req.encoding = req.apparent_encoding
|
||||
# use chardet to catch encoding issue with ISO-8859-1/Latin-1.
|
||||
except Exception as exc:
|
||||
print(f': There was a problem: {exc}.\n: Please enter a valid URL')
|
||||
print(f": There was a problem: {exc}.\n: Please enter a valid URL")
|
||||
sys.exit()
|
||||
else:
|
||||
print(': fetched URL.')
|
||||
print(": fetched URL.")
|
||||
return req.text
|
||||
|
||||
|
||||
|
@ -32,11 +32,11 @@ def convert_html(html):
|
|||
h2t.ignore_emphasis = True
|
||||
h2t.ignore_tables = True
|
||||
h2t.unicode_snob = False
|
||||
h2t.decode_errors = 'replace'
|
||||
h2t.escape_all = True # remove all noise if needed
|
||||
h2t.decode_errors = "replace"
|
||||
h2t.escape_all = False # remove all noise if needed
|
||||
s = h2t.handle(html)
|
||||
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
|
||||
print(': URL converted to text')
|
||||
s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown'
|
||||
print(": URL converted to text")
|
||||
return s
|
||||
|
||||
|
||||
|
@ -48,6 +48,11 @@ def read(infile):
|
|||
except UnicodeDecodeError:
|
||||
with open(infile, encoding="latin-1") as f:
|
||||
return f.read()
|
||||
except IsADirectoryError as exc:
|
||||
print(
|
||||
f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
|
||||
)
|
||||
sys.exit()
|
||||
except FileNotFoundError:
|
||||
print(fnf)
|
||||
sys.exit()
|
||||
|
@ -55,92 +60,70 @@ def read(infile):
|
|||
|
||||
def mkbtext(texttype, args_ss, args_wf):
    """Build a markov model with ``markovify.Text``.

    Args:
        texttype: the input text handed to ``markovify.Text``.
        args_ss: forwarded as ``state_size``.
        args_wf: forwarded as ``well_formed``.

    Returns:
        The constructed ``markovify.Text`` model.
    """
    return markovify.Text(texttype, state_size=args_ss, well_formed=args_wf)
|
||||
|
||||
|
||||
def mkbnewline(texttype, args_ss, args_wf):
    """Build a markov model with ``markovify.NewlineText``.

    Args:
        texttype: the input text handed to ``markovify.NewlineText``.
        args_ss: forwarded as ``state_size``.
        args_wf: forwarded as ``well_formed``.

    Returns:
        The constructed ``markovify.NewlineText`` model.
    """
    return markovify.NewlineText(texttype, state_size=args_ss, well_formed=args_wf)
|
||||
|
||||
|
||||
def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append length-capped generated sentences to a file.

    Each sentence is followed by a blank line and a ``*`` separator line.

    Args:
        tmodel: model object exposing ``make_short_sentence``.
        args_sen: number of sentences to write.
        args_out: path of the output file; opened in append mode.
        args_over: forwarded as ``max_overlap_ratio``.
        args_len: forwarded as ``max_chars``.
    """
    # open once for the whole run; the context manager closes the handle even
    # if sentence generation raises, and also when args_sen is 0.
    with open(args_out, "a") as output:
        for _ in range(args_sen):
            sentence = tmodel.make_short_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() is kept so a non-string result is still written out
            # instead of raising on concatenation.
            output.write(str(sentence) + "\n\n")
            output.write("*\n\n")
|
||||
|
||||
|
||||
def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append generated sentences to a file.

    Each sentence is followed by a blank line and a ``*`` separator line.

    Args:
        tmodel: model object exposing ``make_sentence``.
        args_sen: number of sentences to write.
        args_out: path of the output file; opened in append mode.
        args_over: forwarded as ``max_overlap_ratio``.
        args_len: forwarded as ``max_chars``.
    """
    # open once for the whole run; the context manager closes the handle even
    # if sentence generation raises, and also when args_sen is 0.
    with open(args_out, "a") as output:
        for _ in range(args_sen):
            sentence = tmodel.make_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() is kept so a non-string result is still written out
            # instead of raising on concatenation.
            output.write(str(sentence) + "\n\n")
            output.write("*\n\n")
|
||||
|
||||
|
||||
### functions for mkv_this_scr.py
|
||||
|
||||
def get_urls(st_url):
    """ fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mmm/xx' """
    try:
        req = requests.get(st_url)
        req.raise_for_status()
    except Exception as exc:
        print(f': There was a problem: {exc}.\n: Please enter a valid URL')
        sys.exit()
    print(': fetched initial URL.')
    # NOTE(review): `bs4` is referenced here but its import is commented out
    # at the top of this file -- confirm it is imported wherever this runs.
    soup = bs4.BeautifulSoup(req.text, "lxml")
    # pull the elements containing article links.
    art_elem = soup.select('div[class="fc-item__header"] a[data-link-name="article"]')
    urls = [elem.attrs['href'] for elem in art_elem]
    print(': fetched list of URLs')
    return urls  # returns a LIST


def scr_URLs(urls):  # input a LIST
    """ actually fetch all the URLs obtained by get_urls """
    content = []
    try:
        for url in urls:
            req = requests.get(url)
            req.raise_for_status()
            content.append(req.text)
            print(': fetched page ' + url)
    except Exception as exc:
        print(f': There was a problem: {exc}.\n: There was trouble in your list of URLs')
        sys.exit()
    print(': fetched all pages.')
    return content


# functions for directory:
def dir_list(directory):
    """Collect the text files found under a directory.

    Walks ``directory`` recursively and keeps every file whose name ends in
    ``.txt``, ``.org`` or ``.md``. Prints an error and exits the program when
    ``directory`` is not an existing directory.

    Returns:
        A list of file paths.
    """
    if not os.path.isdir(directory):
        print(": error: please enter a valid directory")
        sys.exit()
    # create a list of files to concatenate:
    matches = [
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith((".txt", ".org", ".md"))
    ]
    print(": text files fetched and combined")
    return matches  # returns a LIST of filenames
|
||||
|
||||
|
||||
def scr_convert_html(content):  # takes a LIST of html pages
    """ convert all pages obtained by scr_URLs """
    h2t = html2text.HTML2Text()
    h2t.ignore_links = True
    h2t.images_to_alt = True
    h2t.ignore_emphasis = True
    h2t.ignore_tables = True
    h2t.unicode_snob = True
    h2t.decode_errors = 'ignore'
    h2t.escape_all = False  # remove all noise if needed
    # convert each page, then remove hash/star from the 'markdown'
    pages = [re.sub('[#*]', '', h2t.handle(page)) for page in content]
    u = ' '.join(pages)  # convert list to string
    print(': Pages converted to text')
    return u
|
||||
# feed this one the matches list:
|
||||
def dir_cat(matchlist, batchfile):
    """Concatenate the files in ``matchlist`` into ``batchfile``.

    Each input is read as UTF-8, falling back to Latin-1 when it does not
    decode. ``batchfile`` is overwritten.

    Args:
        matchlist: iterable of paths of the files to concatenate.
        batchfile: path of the combined output file.
    """
    batch_path = os.path.abspath(batchfile)
    # write UTF-8 explicitly: the locale default encoding may be unable to
    # represent characters that were just decoded from the inputs.
    with open(batchfile, "w", encoding="utf-8") as outfile:
        for fname in matchlist:
            # a batch file left over from an earlier run can show up in the
            # match list; it was just truncated above, so never read it back.
            if os.path.abspath(fname) == batch_path:
                continue
            try:
                with open(fname, encoding="utf-8") as infile:
                    outfile.write(infile.read())
            except UnicodeDecodeError:
                with open(fname, encoding="latin-1") as infile:
                    outfile.write(infile.read())
|
||||
|
|
|
@ -20,93 +20,116 @@
|
|||
"""
|
||||
|
||||
|
||||
import markovify
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import markovify
|
||||
from .functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence
|
||||
|
||||
from .functions import (
|
||||
URL,
|
||||
convert_html,
|
||||
dir_list,
|
||||
dir_cat,
|
||||
read,
|
||||
mkbtext,
|
||||
mkbnewline,
|
||||
writesentence,
|
||||
writeshortsentence,
|
||||
)
|
||||
|
||||
# argparse
|
||||
def parse_the_args(argv=None):
    """Build the mkv-this command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            argparse reads the arguments from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args``.
    """
    parser = argparse.ArgumentParser(
        prog="mkv-this",
        description="markovify local text files or URLs and output the results to a local text file.",
        epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.",
    )

    # positional args:
    parser.add_argument(
        "infile", help="the text file to process. NB: file cannot be empty."
    )
    parser.add_argument(
        "outfile",
        nargs="?",
        default="./mkv-output.txt",
        help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.",
    )

    # optional args:
    parser.add_argument(
        "-u",
        "--URL",
        help="infile is a URL instead. all text it contains will be used.",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--directory",
        help="infile is a directory instead. all text files in it will be used.",
        action="store_true",
    )
    parser.add_argument(
        "-s",
        "--state-size",
        help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-n",
        "--sentences",
        help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.",
        type=int,
        default=5,
    )
    # no default: callers test `args.length` for truthiness to pick the
    # short-sentence writer.
    parser.add_argument(
        "-l",
        "--length",
        help="set maximum number of characters per sentence.",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--overlap",
        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "-c",
        "--combine",
        help="provide an another text file to be combined with the first item.",
    )
    parser.add_argument(
        "-C", "--combine-URL", help="provide a URL to be combined with the first item"
    )
    parser.add_argument(
        "-w",
        "--weight",
        help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.",
        type=float,
        default=1,
    )

    # switches
    # store_true = default to False, become True if flagged.
    parser.add_argument(
        "-f",
        "--well-formed",
        help="enforce 'well_formed': discard sentences containing []{}()"
        "'' from the markov model. use if output is filthy.",
        action="store_true",
    )
    parser.add_argument(
        "--newline",
        help="sentences in input file end with newlines rather than full stops.",
        action="store_true",
    )

    return parser.parse_args(argv)
|
||||
|
||||
|
||||
# make args avail:
|
||||
args = parse_the_args()
|
||||
|
||||
|
@ -120,7 +143,15 @@ def main():
|
|||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
# or normal:
|
||||
# infile is dir:
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
# read -c file:
|
||||
|
@ -132,7 +163,14 @@ def main():
|
|||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
# or normal:
|
||||
# infile is dir:
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
# now combine_URL:
|
||||
|
@ -149,16 +187,17 @@ def main():
|
|||
text_model = mkbtext(text, args.state_size, args.well_formed)
|
||||
ctext_model = mkbtext(ctext, args.state_size, args.well_formed)
|
||||
|
||||
combo_model = markovify.combine(
|
||||
[text_model, ctext_model], [1, args.weight])
|
||||
combo_model = markovify.combine([text_model, ctext_model], [1, args.weight])
|
||||
|
||||
# write it combo!
|
||||
if args.length:
|
||||
writeshortsentence(combo_model, args.sentences,
|
||||
args.outfile, args.overlap, args.length)
|
||||
writeshortsentence(
|
||||
combo_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
else:
|
||||
writesentence(combo_model, args.sentences,
|
||||
args.outfile, args.overlap, args.length)
|
||||
writesentence(
|
||||
combo_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
|
||||
# if no -c/-C, do normal:
|
||||
else:
|
||||
|
@ -167,6 +206,12 @@ def main():
|
|||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
elif args.directory:
|
||||
matchlist = dir_list(args.infile)
|
||||
# place batchfile.txt in user-given directory:
|
||||
batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt"
|
||||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
# or local:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
@ -181,23 +226,30 @@ def main():
|
|||
|
||||
# write it!
|
||||
if args.length:
|
||||
writeshortsentence(text_model, args.sentences,
|
||||
args.outfile, args.overlap, args.length)
|
||||
writeshortsentence(
|
||||
text_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
else:
|
||||
writesentence(text_model, args.sentences,
|
||||
args.outfile, args.overlap, args.length)
|
||||
writesentence(
|
||||
text_model, args.sentences, args.outfile, args.overlap, args.length
|
||||
)
|
||||
|
||||
print('\n: :\n')
|
||||
print("\n: :\n")
|
||||
for key, value in vars(args).items():
|
||||
print(': ' + key.ljust(15, ' ') + ': ' + str(value).ljust(10))
|
||||
print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10))
|
||||
if os.path.isfile(args.outfile):
|
||||
print("\n: literary genius has been written to the file "
|
||||
+ args.outfile + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'")
|
||||
print(
|
||||
"\n: literary genius has been written to the file "
|
||||
+ args.outfile
|
||||
+ ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'"
|
||||
)
|
||||
else:
|
||||
print(': mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!')
|
||||
print(
|
||||
": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!"
|
||||
)
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
48
setup.py
48
setup.py
|
@ -2,32 +2,28 @@ from setuptools import setup, find_packages
|
|||
|
||||
# read the contents of your README file as long_description:
|
||||
from os import path
|
||||
|
||||
# locate README.md next to this file so the build works from any cwd
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="mkv-this",
    version="0.1.43",
    description="cli wrapper for markovify: take a text file or URL, markovify, save the results.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://git.disroot.org/mousebot/mkv-this",
    author="mousebot",
    author_email="martianhiatus@riseup.net",
    license="AGPLv3",
    packages=find_packages(),
    entry_points={
        "console_scripts": [
            "mkv-this = mkv_this.mkv_this:main",
            "mkv-this-dir = mkv_this.mkv_this_dir:main",
        ]
    },
    # NOTE(review): argparse ships with the Python 3 standard library; this
    # entry installs the PyPI package of the same name -- confirm it is
    # still wanted.
    install_requires=["markovify", "argparse", "html2text", "requests"],
    zip_safe=False,
)
|
||||
|
|
Loading…
Reference in New Issue