switch no_well_formed to well_formed
This commit is contained in:
parent
7a19d6402c
commit
d06f84a5f1
|
@ -24,6 +24,7 @@
|
|||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import markovify
|
||||
import sys
|
||||
|
@ -58,8 +59,7 @@ def parse_the_args():
|
|||
# switches
|
||||
parser.add_argument(
|
||||
'-u', '--URL', help="infile is a URL instead.", action='store_true')
|
||||
parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed': allow the inclusion of sentences containing []{}()""'' in the markov model. might filth up your text, eg if it contains 'smart' quotes.", action='store_false')
|
||||
# store_false = default to True.
|
||||
parser.add_argument('-f', '--well-formed', help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true') # store_true = default to False; becomes True only when -f is passed.
|
||||
parser.add_argument(
|
||||
'--newline', help="sentences in input file end with newlines \
|
||||
rather than full stops.", action='store_true')
|
||||
|
@ -85,10 +85,16 @@ def URL(insert):
|
|||
def convert_html(html):
|
||||
h2t = html2text.HTML2Text()
|
||||
h2t.ignore_links = True
|
||||
h2t.ignore_images = True
|
||||
h2t.images_to_alt = True
|
||||
h2t.ignore_emphasis = True
|
||||
h2t.ignore_tables = True
|
||||
h2t.unicode_snob = True
|
||||
h2t.decode_errors = 'ignore'
|
||||
h2t.escape_all = False # remove all noise if needed
|
||||
print(': URL converted to text')
|
||||
return h2t.handle(html)
|
||||
s = h2t.handle(html)
|
||||
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
|
||||
return s
|
||||
|
||||
|
||||
def read(infile):
|
||||
|
@ -105,12 +111,12 @@ def read(infile):
|
|||
|
||||
def mkbtext(texttype):
|
||||
return markovify.Text(texttype, state_size=args.state_size,
|
||||
well_formed=args.no_well_formed)
|
||||
well_formed=args.well_formed)
|
||||
|
||||
|
||||
def mkbnewline(texttype):
|
||||
return markovify.NewlineText(texttype, state_size=args.state_size,
|
||||
well_formed=args.no_well_formed)
|
||||
well_formed=args.well_formed)
|
||||
|
||||
|
||||
def writesentence(tmodel):
|
||||
|
@ -146,7 +152,7 @@ def main():
|
|||
if args.URL:
|
||||
html = URL(args.infile)
|
||||
text = convert_html(html)
|
||||
# or normal:
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
# read -c file:
|
||||
|
|
|
@ -22,6 +22,7 @@ a (very basic) script to collect all text files in a directory, markovify them a
|
|||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import markovify
|
||||
import sys
|
||||
import argparse
|
||||
|
@ -46,14 +47,14 @@ def parse_the_args():
|
|||
parser.add_argument('-w', '--weight', help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.", type=float, default=1)
|
||||
|
||||
# switches
|
||||
parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed', ie allow the inclusion of sentences with []{}()""'' in them in the markov model. this might filth up your text, especially if it contains 'smart' quotes.", action='store_false')
|
||||
parser.add_argument('-f', '--well-formed', help="enforce 'well_formed', doscard sentences with []{}()""'' from the markov model. use if output is filthy.", action='store_true')
|
||||
# store_true = default to False; well_formed is only enforced when -f is passed.
|
||||
parser.add_argument('--newline', help="sentences in input file end with newlines rather than with full stops.", action='store_true')
|
||||
# store_true = default to False, become True if flagged.
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
# retch, read, build, write fns:
|
||||
# fetch/read/build/write fns:
|
||||
|
||||
|
||||
def URL(insert):
|
||||
|
@ -71,10 +72,16 @@ def URL(insert):
|
|||
def convert_html(html):
|
||||
h2t = html2text.HTML2Text()
|
||||
h2t.ignore_links = True
|
||||
h2t.ignore_images = True
|
||||
h2t.images_to_alt = True
|
||||
h2t.ignore_emphasis = True
|
||||
h2t.ignore_tables = True
|
||||
h2t.unicode_snob = True
|
||||
h2t.decode_errors = 'ignore'
|
||||
h2t.escape_all = False # remove all noise if needed
|
||||
print(': URL converted to text')
|
||||
return h2t.handle(html)
|
||||
s = h2t.handle(html)
|
||||
s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
|
||||
return s
|
||||
|
||||
|
||||
def read(infile):
|
||||
|
@ -90,11 +97,11 @@ def read(infile):
|
|||
|
||||
def mkbtext(texttype):
|
||||
return markovify.Text(texttype, state_size=args.state_size,
|
||||
well_formed=args.no_well_formed)
|
||||
well_formed=args.well_formed)
|
||||
|
||||
def mkbnewline(texttype):
|
||||
return markovify.NewlineText(texttype, state_size=args.state_size,
|
||||
well_formed=args.no_well_formed)
|
||||
well_formed=args.well_formed)
|
||||
|
||||
def writesentence(tmodel):
|
||||
for i in range(args.sentences):
|
||||
|
|
Loading…
Reference in New Issue