switch no_well_formed to well_formed

This commit is contained in:
mousebot 2020-04-24 21:44:51 -03:00
parent 7a19d6402c
commit d06f84a5f1
2 changed files with 26 additions and 13 deletions

View File

@ -24,6 +24,7 @@
"""
import os
import re
import requests
import markovify
import sys
@ -58,8 +59,7 @@ def parse_the_args():
# switches
parser.add_argument(
'-u', '--URL', help="infile is a URL instead.", action='store_true')
parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed': allow the inclusion of sentences containing []{}()""'' in the markov model. might filth up your text, eg if it contains 'smart' quotes.", action='store_false')
# store_false = default to True.
parser.add_argument('-f', '--well-formed', help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true') # store_true = default to False.
parser.add_argument(
'--newline', help="sentences in input file end with newlines \
rather than full stops.", action='store_true')
@ -85,10 +85,16 @@ def URL(insert):
def convert_html(html):
h2t = html2text.HTML2Text()
h2t.ignore_links = True
h2t.ignore_images = True
h2t.images_to_alt = True
h2t.ignore_emphasis = True
h2t.ignore_tables = True
h2t.unicode_snob = True
h2t.decode_errors = 'ignore'
h2t.escape_all = False # remove all noise if needed
print(': URL converted to text')
return h2t.handle(html)
s = h2t.handle(html)
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
return s
def read(infile):
@ -105,12 +111,12 @@ def read(infile):
def mkbtext(texttype):
return markovify.Text(texttype, state_size=args.state_size,
well_formed=args.no_well_formed)
well_formed=args.well_formed)
def mkbnewline(texttype):
return markovify.NewlineText(texttype, state_size=args.state_size,
well_formed=args.no_well_formed)
well_formed=args.well_formed)
def writesentence(tmodel):
@ -146,7 +152,7 @@ def main():
if args.URL:
html = URL(args.infile)
text = convert_html(html)
# or normal:
# or normal:
else:
text = read(args.infile)
# read -c file:

View File

@ -22,6 +22,7 @@ a (very basic) script to collect all text files in a directory, markovify them a
"""
import os
import re
import markovify
import sys
import argparse
@ -46,14 +47,14 @@ def parse_the_args():
parser.add_argument('-w', '--weight', help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.", type=float, default=1)
# switches
parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed', ie allow the inclusion of sentences with []{}()""'' in them in the markov model. this might filth up your text, especially if it contains 'smart' quotes.", action='store_false')
parser.add_argument('-f', '--well-formed', help="enforce 'well_formed', doscard sentences with []{}()""'' from the markov model. use if output is filthy.", action='store_true')
# store_true = default to False.
parser.add_argument('--newline', help="sentences in input file end with newlines rather than with full stops.", action='store_true')
# store_true = default to False, becomes True if flagged.
return parser.parse_args()
# retch, read, build, write fns:
# fetch/read/build/write fns:
def URL(insert):
@ -71,10 +72,16 @@ def URL(insert):
def convert_html(html):
h2t = html2text.HTML2Text()
h2t.ignore_links = True
h2t.ignore_images = True
h2t.images_to_alt = True
h2t.ignore_emphasis = True
h2t.ignore_tables = True
h2t.unicode_snob = True
h2t.decode_errors = 'ignore'
h2t.escape_all = False # remove all noise if needed
print(': URL converted to text')
return h2t.handle(html)
s = h2t.handle(html)
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
return s
def read(infile):
@ -90,11 +97,11 @@ def read(infile):
def mkbtext(texttype):
return markovify.Text(texttype, state_size=args.state_size,
well_formed=args.no_well_formed)
well_formed=args.well_formed)
def mkbnewline(texttype):
return markovify.NewlineText(texttype, state_size=args.state_size,
well_formed=args.no_well_formed)
well_formed=args.well_formed)
def writesentence(tmodel):
for i in range(args.sentences):