# mkv-this/mkv_this/functions.py
#
# 130 lines
# 3.8 KiB
# Python

import os
import re
import requests
import markovify
import sys
import html2text
# Shared message printed when a requested input file cannot be found.
fnf = (
    ": error: file not found. "
    "please provide a path to a really-existing file!"
)
def URL(insert, timeout=30):
    """Fetch a URL and return the response body as text.

    Args:
        insert: address of the page to download.
        timeout: seconds to wait for the server before giving up; passed
            straight to ``requests.get``.  Without a timeout a stalled
            server would block the program indefinitely.

    Returns:
        The decoded body of the response.

    Exits the program with status 1, after printing a message, when the
    request fails or the server answers with an HTTP error status.
    """
    try:
        req = requests.get(insert, timeout=timeout)
        req.raise_for_status()
        # try to fix encoding issues: decode with the encoding detected
        # from the content rather than the one declared by the server
        req.encoding = req.apparent_encoding
    except Exception as exc:
        # Deliberately broad: any failure while fetching ends the run
        # with a readable message instead of a traceback.
        print(f": There was a problem: {exc}.\n: Please enter a valid URL")
        sys.exit(1)  # non-zero so callers and shells can detect the failure
    print(": fetched URL.")
    return req.text
def convert_html(html):
    """Turn the HTML of a fetched page into plain text."""
    converter = html2text.HTML2Text()
    settings = {
        "ignore_links": True,
        "images_to_alt": True,
        "ignore_emphasis": True,
        "ignore_tables": True,
        "unicode_snob": True,
        "decode_errors": "ignore",
        "escape_all": False,  # remove all noise if needed
    }
    for option, value in settings.items():
        setattr(converter, option, value)
    print(": URL converted to text")
    markdown = converter.handle(html)
    # remove hashes and stars from the 'markdown'
    return re.sub("[#*]", "", markdown)
def read(infile):
    """Read a local text file for the markov model.

    The file is decoded as UTF-8 first; if that fails with a decoding
    error it is read again as Latin-1.

    Args:
        infile: path of the file to read.

    Returns:
        The whole content of the file as a string.

    Exits the program with status 1, after printing a message, when
    ``infile`` is a directory or does not exist.
    """
    try:
        with open(infile, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to a single-byte encoding.
        with open(infile, encoding="latin-1") as f:
            return f.read()
    except IsADirectoryError as exc:
        print(
            f": There was a problem: {exc}.\n: Looks like you entered a directory. Use '-d' for that."
        )
        sys.exit(1)  # non-zero so the failure is visible to the caller
    except FileNotFoundError:
        print(fnf)
        sys.exit(1)
def mkbtext(texttype, args_ss, args_wf):
    """Create a ``markovify.Text`` model from the given text."""
    model_options = {"state_size": args_ss, "well_formed": args_wf}
    return markovify.Text(texttype, **model_options)
def mkbnewline(texttype, args_ss, args_wf):
    """Create a ``markovify.NewlineText`` model from the given text."""
    model_options = {"state_size": args_ss, "well_formed": args_wf}
    return markovify.NewlineText(texttype, **model_options)
def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append a batch of length-limited generated sentences to a file.

    Args:
        tmodel: model object providing ``make_short_sentence``.
        args_sen: number of sentences to generate.
        args_out: path of the output file; it is opened in append mode.
        args_over: value passed on as ``max_overlap_ratio``.
        args_len: value passed on as ``max_chars``.

    Every sentence is followed by a blank line and the batch is closed
    with a line holding a single ``*``.  Nothing is written when
    ``args_sen`` is less than 1.
    """
    if args_sen < 1:
        return
    # One handle for the whole batch; ``with`` closes it even when
    # generating or writing a sentence raises.
    with open(args_out, "a") as output:  # append
        for _ in range(args_sen):
            sentence = tmodel.make_short_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() so that a non-string result is still written out
            # instead of raising on the concatenation.
            output.write(str(sentence) + "\n\n")
        output.write("*\n\n")
def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """Append a batch of generated sentences to a file.

    Args:
        tmodel: model object providing ``make_sentence``.
        args_sen: number of sentences to generate.
        args_out: path of the output file; it is opened in append mode.
        args_over: value passed on as ``max_overlap_ratio``.
        args_len: value passed on as ``max_chars``.
            NOTE(review): whether ``make_sentence`` honours ``max_chars``
            depends on the model implementation -- confirm.

    Every sentence is followed by a blank line and the batch is closed
    with a line holding a single ``*``.  Nothing is written when
    ``args_sen`` is less than 1.
    """
    if args_sen < 1:
        return
    # One handle for the whole batch; ``with`` closes it even when
    # generating or writing a sentence raises.
    with open(args_out, "a") as output:  # append
        for _ in range(args_sen):
            sentence = tmodel.make_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() so that a non-string result is still written out
            # instead of raising on the concatenation.
            output.write(str(sentence) + "\n\n")
        output.write("*\n\n")
# functions for directory:
def dir_list(directory):
    """Collect the text files found under a directory.

    Args:
        directory: path of the directory to search; its sub-directories
            are searched as well.

    Returns:
        A list with the paths of all files whose names end in ``.txt``,
        ``.org`` or ``.md``.

    Exits the program with status 1, after printing a message, when
    ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        print(": error: please enter a valid directory")
        sys.exit(1)  # non-zero so the failure is visible to the caller
    # create a list of files to concatenate:
    matches = []
    for root, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith((".txt", ".org", ".md")):
                matches.append(os.path.join(root, filename))
    print(": text files fetched and combined")
    return matches  # returns a LIST of filenames
# feed this one the matches list:
def dir_cat(matchlist, batchfile):
    """Concatenate the listed files into a single batch file.

    Args:
        matchlist: paths of the files to join, written in the given order.
        batchfile: path of the file to create; an existing file is
            overwritten.

    Each input is decoded as UTF-8, falling back to Latin-1 when that
    fails.  The batch file is always written as UTF-8 so the result does
    not depend on the platform's default encoding.
    """
    # concatenate into batchfile.txt:
    with open(batchfile, "w", encoding="utf-8") as outfile:
        for fname in matchlist:
            try:
                with open(fname, encoding="utf-8") as infile:
                    text = infile.read()
            except UnicodeDecodeError:
                # Not valid UTF-8: fall back to a single-byte encoding.
                with open(fname, encoding="latin-1") as infile:
                    text = infile.read()
            outfile.write(text)