diff --git a/mkv_this/mkv_this.py b/mkv_this/mkv_this.py index 723c6f5..4551477 100755 --- a/mkv_this/mkv_this.py +++ b/mkv_this/mkv_this.py @@ -24,6 +24,7 @@ """ import os +import re import requests import markovify import sys @@ -58,8 +59,7 @@ def parse_the_args(): # switches parser.add_argument( '-u', '--URL', help="infile is a URL instead.", action='store_true') - parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed': allow the inclusion of sentences containing []{}()""'' in the markov model. might filth up your text, eg if it contains 'smart' quotes.", action='store_false') - # store_false = default to True. + parser.add_argument('-f', '--well-formed', help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true') # store_true = default to False. parser.add_argument( '--newline', help="sentences in input file end with newlines \ rather than full stops.", action='store_true') @@ -85,10 +85,16 @@ def URL(insert): def convert_html(html): h2t = html2text.HTML2Text() h2t.ignore_links = True - h2t.ignore_images = True + h2t.images_to_alt = True h2t.ignore_emphasis = True + h2t.ignore_tables = True + h2t.unicode_snob = True + h2t.decode_errors = 'ignore' + h2t.escape_all = False # remove all noise if needed print(': URL converted to text') - return h2t.handle(html) + s = h2t.handle(html) + s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown' + return s def read(infile): @@ -105,12 +111,12 @@ def read(infile): def mkbtext(texttype): return markovify.Text(texttype, state_size=args.state_size, - well_formed=args.no_well_formed) + well_formed=args.well_formed) def mkbnewline(texttype): return markovify.NewlineText(texttype, state_size=args.state_size, - well_formed=args.no_well_formed) + well_formed=args.well_formed) def writesentence(tmodel): @@ -146,7 +152,7 @@ def main(): if args.URL: html = URL(args.infile) text = convert_html(html) - # or normal: 
+ # or normal: else: text = read(args.infile) # read -c file: diff --git a/mkv_this/mkv_this_dir.py b/mkv_this/mkv_this_dir.py index 943f8b0..531740f 100644 --- a/mkv_this/mkv_this_dir.py +++ b/mkv_this/mkv_this_dir.py @@ -22,6 +22,7 @@ a (very basic) script to collect all text files in a directory, markovify them a """ import os +import re import markovify import sys import argparse @@ -46,14 +47,14 @@ def parse_the_args(): parser.add_argument('-w', '--weight', help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.", type=float, default=1) # switches - parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed', ie allow the inclusion of sentences with []{}()""'' in them in the markov model. this might filth up your text, especially if it contains 'smart' quotes.", action='store_false') + parser.add_argument('-f', '--well-formed', help="enforce 'well_formed', discard sentences with []{}()""'' from the markov model. use if output is filthy.", action='store_true') # store_false = default to True. parser.add_argument('--newline', help="sentences in input file end with newlines rather than with full stops.", action='store_true') # store_true = default to False, become True if flagged. 
return parser.parse_args() -# retch, read, build, write fns: +# fetch/read/build/write fns: def URL(insert): @@ -71,10 +72,16 @@ def URL(insert): def convert_html(html): h2t = html2text.HTML2Text() h2t.ignore_links = True - h2t.ignore_images = True + h2t.images_to_alt = True h2t.ignore_emphasis = True + h2t.ignore_tables = True + h2t.unicode_snob = True + h2t.decode_errors = 'ignore' + h2t.escape_all = False # remove all noise if needed print(': URL converted to text') - return h2t.handle(html) + s = h2t.handle(html) + s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown' + return s def read(infile): @@ -90,11 +97,11 @@ def read(infile): def mkbtext(texttype): return markovify.Text(texttype, state_size=args.state_size, - well_formed=args.no_well_formed) + well_formed=args.well_formed) def mkbnewline(texttype): return markovify.NewlineText(texttype, state_size=args.state_size, - well_formed=args.no_well_formed) + well_formed=args.well_formed) def writesentence(tmodel): for i in range(args.sentences):