diff --git a/mkv_this/functions.py b/mkv_this/functions.py index 0ea4be2..8e7af49 100644 --- a/mkv_this/functions.py +++ b/mkv_this/functions.py @@ -3,10 +3,10 @@ import re import requests import markovify import sys -import argparse import html2text -fnf = ': error: file not found. please provide a path to a really-existing file!' + +fnf = ": error: file not found. please provide a path to a really-existing file!" def URL(insert): @@ -14,11 +14,13 @@ def URL(insert): try: req = requests.get(insert) req.raise_for_status() + req.encoding = req.apparent_encoding + # try to fix encoding issues except Exception as exc: - print(f': There was a problem: {exc}.\n: Please enter a valid URL') + print(f": There was a problem: {exc}.\n: Please enter a valid URL") sys.exit() else: - print(': fetched URL.') + print(": fetched URL.") return req.text @@ -30,11 +32,11 @@ def convert_html(html): h2t.ignore_emphasis = True h2t.ignore_tables = True h2t.unicode_snob = True - h2t.decode_errors = 'ignore' - h2t.escape_all = False # remove all noise if needed - print(': URL converted to text') + h2t.decode_errors = "ignore" + h2t.escape_all = False # remove all noise if needed + print(": URL converted to text") s = h2t.handle(html) - s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown' + s = re.sub("[#*]", "", s) # remove hashes and stars from the 'markdown' return s @@ -46,6 +48,11 @@ def read(infile): except UnicodeDecodeError: with open(infile, encoding="latin-1") as f: return f.read() + except IsADirectoryError as exc: + print( + f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that." + ) + sys.exit() except FileNotFoundError: print(fnf) sys.exit() @@ -53,92 +60,70 @@ def read(infile): def mkbtext(texttype, args_ss, args_wf): """ build a markov model """ - return markovify.Text(texttype, state_size=args_ss, - well_formed=args_wf) + return markovify.Text(texttype, state_size=args_ss, well_formed=args_wf) def mkbnewline(texttype, args_ss, args_wf): """ build a markov model, newline """ - return markovify.NewlineText(texttype, state_size=args_ss, - well_formed=args_wf) + return markovify.NewlineText(texttype, state_size=args_ss, well_formed=args_wf) def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len): """ actually make the damn litter-atchya """ for i in range(args_sen): - output = open(args_out, 'a') # append - output.write(str(tmodel.make_short_sentence( - tries=2000, max_overlap_ratio=args_over, - max_chars=args_len)) + '\n\n') - output.write(str('*\n\n')) + output = open(args_out, "a") # append + output.write( + str( + tmodel.make_short_sentence( + tries=2000, max_overlap_ratio=args_over, max_chars=args_len + ) + ) + + "\n\n" + ) + output.write(str("*\n\n")) output.close() def writesentence(tmodel, args_sen, args_out, args_over, args_len): """ actually make the damn litter-atchya, and short """ for i in range(args_sen): - output = open(args_out, 'a') # append - output.write(str(tmodel.make_sentence( - tries=2000, max_overlap_ratio=args_over, - max_chars=args_len)) + '\n\n') - output.write(str('*\n\n')) + output = open(args_out, "a") # append + output.write( + str( + tmodel.make_sentence( + tries=2000, max_overlap_ratio=args_over, max_chars=args_len + ) + ) + + "\n\n" + ) + output.write(str("*\n\n")) output.close() -### functions for mkv_this_scr.py - -def get_urls(st_url): - """ fetch a bunch of article URLs from The Guardian world news page for a given date. Format: 'https://theguardian.com/cat/YEAR/mth/xx' """ - try: - req = requests.get(st_url) - req.raise_for_status() - except Exception as exc: - print(f': There was a problem: {exc}.\n: Please enter a valid URL') - sys.exit() +# functions for directory: +def dir_list(directory): + # create a list of files to concatenate: + matches = [] + if os.path.isdir(directory) is True: + for root, dirnames, filenames in os.walk(directory): + for filename in filenames: + if filename.endswith((".txt", ".org", ".md")): + matches.append(os.path.join(root, filename)) + print(": text files fetched and combined") else: - print(': fetched initial URL.') - soup = bs4.BeautifulSoup(req.text, "lxml") - art_elem = soup.select('div[class="fc-item__header"] a[data-link-name="article"]') # pull the element containing article links. - urls = [] - for i in range(len(art_elem)): - urls = urls + [art_elem[i].attrs['href']] - print(': fetched list of URLs') - return urls # returns a LIST - - -def scr_URLs(urls): # input a LIST - """ actually fetch all the URLs obtained by get_urls """ - try: - content = [] - for i in range(len(urls)): - req = requests.get(urls[i]) - req.raise_for_status() - content = content + [req.text] # SUPER slow. - print(': fetched page ' + urls[i]) - except Exception as exc: - print(f': There was a problem: {exc}.\n: There was trouble in your list of URLs') + print(": error: please enter a valid directory") sys.exit() - else: - print(': fetched all pages.') - return content + return matches # returns a LIST of filenames - -def scr_convert_html(content): # takes a LIST of html pages - """ convert all pages obtained by scr_URLs """ - h2t = html2text.HTML2Text() - h2t.ignore_links = True - h2t.images_to_alt = True - h2t.ignore_emphasis = True - h2t.ignore_tables = True - h2t.unicode_snob = True - h2t.decode_errors = 'ignore' - h2t.escape_all = False # remove all noise if needed - s = [] - for i in range(len(content)): - s = s + [h2t.handle(content[i])] # convert - t = [] - for i in range(len(s)): - t = t + [re.sub('[#*]', '', s[i])] # remove hash/star from the 'markdown' - u = ' '.join(t) # convert list to string - print(': Pages converted to text') - return u + +# feed this one the matches list: +def dir_cat(matchlist, batchfile): + # concatenate into batchfile.txt: + with open(batchfile, "w") as outfile: + for fname in matchlist: + try: + with open(fname, encoding="utf-8") as infile: + outfile.write(infile.read()) + except UnicodeDecodeError: + with open(fname, encoding="latin-1") as infile: + outfile.write(infile.read()) diff --git a/mkv_this/mkv_this.py b/mkv_this/mkv_this.py index 9a0df94..b03a396 100755 --- a/mkv_this/mkv_this.py +++ b/mkv_this/mkv_this.py @@ -20,47 +20,106 @@ """ -import re -import requests import markovify -import html2text import os import sys import argparse -from functions import URL, convert_html, read, mkbtext, mkbnewline, writesentence, writeshortsentence +from .functions import ( + URL, + convert_html, + dir_list, + dir_cat, + read, + mkbtext, + mkbnewline, + writesentence, + writeshortsentence, +) # argparse def parse_the_args(): - parser = argparse.ArgumentParser(prog="mkv-this", description="markovify local text files or URLs and output the results to a local text file.", - epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.") + parser = argparse.ArgumentParser( + prog="mkv-this", + description="markovify local text files or URLs and output the results to a local text file.", + epilog="may you find many prophetic énoncés in your virtual bird guts! Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial.", + ) # positional args: parser.add_argument( - 'infile', help="the text file to process. NB: file cannot be empty.") - parser.add_argument('outfile', nargs='?', default="./mkv-output.txt", - help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.") + "infile", help="the text file to process. NB: file cannot be empty." + ) + parser.add_argument( + "outfile", + nargs="?", + default="./mkv-output.txt", + help="the file to save to. if the file is used more than once, subsequent literature will be appended to it. defaults to ./mkv-output.txt.", + ) # optional args: - parser.add_argument('-s', '--state-size', help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.", type=int, default=2) parser.add_argument( - '-n', '--sentences', help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", type=int, default=5) + "-s", + "--state-size", + help="the number of preceeding words used to calculate the probability of the next word. defaults to 2, 1 makes it more random, 3 less so. > 4 will likely have little effect.", + type=int, + default=2, + ) parser.add_argument( - '-l', '--length', help="set maximum number of characters per sentence.", type=int) + "-n", + "--sentences", + help="the number of 'sentences' to output. defaults to 5. NB: if your text has no initial caps, a 'sentence' will be a paragraph.", + type=int, + default=5, + ) parser.add_argument( - '-o', '--overlap', help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5", type=float, default=0.5) + "-l", + "--length", + help="set maximum number of characters per sentence.", + type=int, + ) parser.add_argument( - '-c', '--combine', help="provide an another text file to be combined with the first item.") - parser.add_argument('-C', '--combine-URL', - help="provide a URL to be combined with the first item") - parser.add_argument('-w', '--weight', help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.", type=float, default=1) + "-o", + "--overlap", + help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5", + type=float, + default=0.5, + ) + parser.add_argument( + "-c", + "--combine", + help="provide an another text file to be combined with the first item.", + ) + parser.add_argument( + "-C", "--combine-URL", help="provide a URL to be combined with the first item" + ) + parser.add_argument( + "-w", + "--weight", + help="specify the weight to be given to the text provided with -c or -C. defaults to 1, and the weight of the initial text is 1. 1.5 will place more weight on the second text, 0.5 will place less.", + type=float, + default=1, + ) # switches parser.add_argument( - '-u', '--URL', help="infile is a URL instead.", action='store_true') - parser.add_argument('-f', '--well-formed', help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true') # store_false = default to True. + "-u", "--URL", help="infile is a URL instead.", action="store_true" + ) parser.add_argument( - '--newline', help="sentences in input file end with newlines \ - rather than full stops.", action='store_true') + "-d", "--directory", help="infile is a directory instead.", action="store_true" + ) + parser.add_argument( + "-f", + "--well-formed", + help="enforce 'well_formed': discard sentences containing []{}()" + "'' from the markov model. use if output is filthy.", + action="store_true", + ) + # store_true = default to False. + parser.add_argument( + "--newline", + help="sentences in input file end with newlines \ + rather than full stops.", + action="store_true", + ) # store_true = default to False, become True if flagged. return parser.parse_args() @@ -79,7 +138,15 @@ def main(): if args.URL: html = URL(args.infile) text = convert_html(html) - # or normal: + # infile is dir: + elif args.directory: + matchlist = dir_list(args.infile) + # place batchfile.txt in user-given directory: + batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt" + dir_cat(matchlist, batchfile) + text = read(batchfile) + os.unlink(batchfile) + # or normal: else: text = read(args.infile) # read -c file: @@ -91,7 +158,14 @@ def main(): if args.URL: html = URL(args.infile) text = convert_html(html) - # or normal: + # infile is dir: + elif args.directory: + matchlist = dir_list(args.infile) + # place batchfile.txt in user-given directory: + batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt" + dir_cat(matchlist, batchfile) + text = read(batchfile) + # or normal: else: text = read(args.infile) # now combine_URL: @@ -108,14 +182,17 @@ def main(): text_model = mkbtext(text, args.state_size, args.well_formed) ctext_model = mkbtext(ctext, args.state_size, args.well_formed) - combo_model = markovify.combine( - [text_model, ctext_model], [1, args.weight]) + combo_model = markovify.combine([text_model, ctext_model], [1, args.weight]) # write it combo! if args.length: - writeshortsentence(combo_model, args.sentences, args.outfile, args.overlap, args.length) + writeshortsentence( + combo_model, args.sentences, args.outfile, args.overlap, args.length + ) else: - writesentence(combo_model, args.sentences, args.outfile, args.overlap, args.length) + writesentence( + combo_model, args.sentences, args.outfile, args.overlap, args.length + ) # if no -c/-C, do normal: else: @@ -124,6 +201,12 @@ def main(): if args.URL: html = URL(args.infile) text = convert_html(html) + elif args.directory: + matchlist = dir_list(args.infile) + # place batchfile.txt in user-given directory: + batchfile = os.path.dirname(args.infile) + os.path.sep + "batchfile.txt" + dir_cat(matchlist, batchfile) + text = read(batchfile) # or local: else: text = read(args.infile) @@ -138,21 +221,30 @@ def main(): # write it! if args.length: - writeshortsentence(text_model, args.sentences, args.outfile, args.overlap, args.length) + writeshortsentence( + text_model, args.sentences, args.outfile, args.overlap, args.length + ) else: - writesentence(text_model, args.sentences, args.outfile, args.overlap, args.length) + writesentence( + text_model, args.sentences, args.outfile, args.overlap, args.length + ) - print('\n: :\n') + print("\n: :\n") for key, value in vars(args).items(): - print(': ' + key.ljust(15, ' ') + ': ' + str(value).ljust(10)) + print(": " + key.ljust(15, " ") + ": " + str(value).ljust(10)) if os.path.isfile(args.outfile): - print("\n: literary genius has been written to the file " - + args.outfile + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'") + print( + "\n: literary genius has been written to the file " + + args.outfile + + ". thanks for playing!\n\n: 'Here, this is not at all the becomings that are connected... so if you want to edit it like a bot yourself, it is trivial. Yes, although your very smile suggests that this Armenian enclave is not at all the becomings that are connected...'" + ) else: - print(': mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!') + print( + ": mkv-this ran but did NOT create an output file as requested. this is a very regrettable and dangerous situation. contact the package maintainer asap. soz!" + ) sys.exit() -if __name__ == '__main__': +if __name__ == "__main__": main()