switch no_well_formed to well_formed

2020-04-24 21:44:51 -03:00 · 2020-04-24 21:44:51 -03:00 · d06f84a5f1
parent 7a19d6402c
commit d06f84a5f1
2 changed files with 26 additions and 13 deletions
--- a/mkv_this/mkv_this.py
+++ b/mkv_this/mkv_this.py
@@ -24,6 +24,7 @@
 """

 import os
+import re
 import requests
 import markovify
 import sys
@@ -58,8 +59,7 @@ def parse_the_args():
    # switches
    parser.add_argument(
        '-u', '--URL', help="infile is a URL instead.", action='store_true')
-    parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed': allow the inclusion of sentences containing []{}()""'' in the markov model. might filth up your text, eg if it contains 'smart' quotes.", action='store_false')
-    # store_false = default to True.
+    parser.add_argument('-f', '--well-formed', help="enforce 'well_formed': discard sentences containing []{}()""'' from the markov model. use if output is filthy.", action='store_true') # store_true = default to False, becomes True if flagged.
    parser.add_argument(
        '--newline', help="sentences in input file end with newlines \
        rather than full stops.", action='store_true')
@@ -85,10 +85,16 @@ def URL(insert):
 def convert_html(html):
    h2t = html2text.HTML2Text()
    h2t.ignore_links = True
-    h2t.ignore_images = True
+    h2t.images_to_alt = True
    h2t.ignore_emphasis = True
+    h2t.ignore_tables = True
+    h2t.unicode_snob = True
+    h2t.decode_errors = 'ignore'
+    h2t.escape_all = False # remove all noise if needed
    print(': URL converted to text')
-    return h2t.handle(html)
+    s = h2t.handle(html)
+    s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
+    return s


 def read(infile):
@@ -105,12 +111,12 @@ def read(infile):

 def mkbtext(texttype):
    return markovify.Text(texttype, state_size=args.state_size,
-                          well_formed=args.no_well_formed)
+                          well_formed=args.well_formed)


 def mkbnewline(texttype):
    return markovify.NewlineText(texttype, state_size=args.state_size,
-                                 well_formed=args.no_well_formed)
+                                 well_formed=args.well_formed)


 def writesentence(tmodel):
@@ -146,7 +152,7 @@ def main():
            if args.URL:
                html = URL(args.infile)
                text = convert_html(html)
-            # or normal:
+                # or normal:
            else:
                text = read(args.infile)
            # read -c file:
--- a/mkv_this/mkv_this_dir.py
+++ b/mkv_this/mkv_this_dir.py
@@ -22,6 +22,7 @@ a (very basic) script to collect all text files in a directory, markovify them a
 """

 import os
+import re
 import markovify
 import sys
 import argparse
@@ -46,14 +47,14 @@ def parse_the_args():
    parser.add_argument('-w', '--weight', help="specify the weight to be given to the second text provided with --combine. defaults to 1, and the weight of the initial text is also 1. setting this to 1.5 will place 50 percent more weight on the second text. setting it to 0.5 will place less.", type=float, default=1)

    # switches
-    parser.add_argument('-f', '--no-well-formed', help="don't enforce 'well_formed', ie allow the inclusion of sentences with []{}()""'' in them in the markov model. this might filth up your text, especially if it contains 'smart' quotes.", action='store_false')
+    parser.add_argument('-f', '--well-formed', help="enforce 'well_formed', discard sentences with []{}()""'' from the markov model. use if output is filthy.", action='store_true')
    # store_false = default to True.
    parser.add_argument('--newline', help="sentences in input file end with newlines rather than with full stops.", action='store_true')
    # store_true = default to False, become True if flagged.

    return parser.parse_args()

-# retch, read, build, write fns:
+# fetch/read/build/write fns:


 def URL(insert):
@@ -71,10 +72,16 @@ def URL(insert):
 def convert_html(html):
    h2t = html2text.HTML2Text()
    h2t.ignore_links = True
-    h2t.ignore_images = True
+    h2t.images_to_alt = True
    h2t.ignore_emphasis = True
+    h2t.ignore_tables = True
+    h2t.unicode_snob = True
+    h2t.decode_errors = 'ignore'
+    h2t.escape_all = False # remove all noise if needed
    print(': URL converted to text')
-    return h2t.handle(html)
+    s = h2t.handle(html)
+    s = re.sub('[#*]', '', s) # remove hashes and stars from the 'markdown'
+    return s


 def read(infile):
@@ -90,11 +97,11 @@ def read(infile):

 def mkbtext(texttype):
    return markovify.Text(texttype, state_size=args.state_size,
-                          well_formed=args.no_well_formed)
+                          well_formed=args.well_formed)

 def mkbnewline(texttype):
    return markovify.NewlineText(texttype, state_size=args.state_size,
-                          well_formed=args.no_well_formed)
+                          well_formed=args.well_formed)

 def writesentence(tmodel):
    for i in range(args.sentences):