mkv-this/mkv_this/functions.py

189 lines
5.6 KiB
Python

import os
import re
import requests
import markovify
import sys
import html2text
# message printed by read() when the requested input file does not exist
fnf = ": error: file not found. please provide a path to a really-existing file!"
def url(insert, timeout=30):
    """Fetch a webpage and return its body as html text.

    Args:
        insert: URL of the page to fetch.
        timeout: Seconds to wait on the server before giving up; handed
            straight to ``requests.get``.

    Returns:
        The response body decoded to ``str``.

    If the request fails or the server answers with an HTTP error status,
    a message is printed and the program exits with status 1.
    """
    try:
        # Without a timeout requests can block forever on a silent server.
        req = requests.get(insert, timeout=timeout)
        req.raise_for_status()
        # Decode with the encoding detected from the content (chardet) to
        # catch encoding issues with ISO-8859-1/Latin-1.
        req.encoding = req.apparent_encoding
    except Exception as exc:
        # Deliberately broad: any failure here is reported to the user and
        # ends the run.
        print(f": there was trouble: {exc}.\n: please enter a valid url.")
        sys.exit(1)
    else:
        print(": fetched html from url.")
        return req.text
def convert_html(html):
    """Convert an html page to plain text.

    The page is run through an ``html2text.HTML2Text`` converter configured
    with the options below, after which every ``#`` and ``*`` character is
    stripped from the result.
    """
    converter = html2text.HTML2Text()
    settings = {
        "images_to_alt": True,
        "ignore_links": True,
        "ignore_emphasis": True,
        "ignore_tables": True,
        "unicode_snob": False,
        "decode_errors": "replace",
        "escape_all": False,  # remove all noise if needed
    }
    # Applied in the same order as listed above.
    for option, value in settings.items():
        setattr(converter, option, value)
    markdown = converter.handle(html)
    # remove hashes and stars from the 'markdown'
    plain = re.sub("[#*]", "", markdown)
    print(": html converted to plain text")
    return plain
def read(infile):
    """Read a text file and return its contents, ready for markov.

    The file is decoded as UTF-8; if that fails it is read again as Latin-1.

    Args:
        infile: Path of the text file to read.

    Returns:
        The complete contents of the file as a string.

    When *infile* ends in ``.pdf`` (any letter case), is a directory, or
    does not exist, a message is printed and the program exits with
    status 1.
    """
    # Checked up front so the try block below only wraps the file access.
    if infile.lower().endswith(".pdf"):
        print(
            "looks like you entered a pdf file. you need to use the '-P' flag to convert it."
        )
        sys.exit(1)
    try:
        with open(infile, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to Latin-1 for the whole file.
        with open(infile, encoding="latin-1") as f:
            return f.read()
    except IsADirectoryError as exc:
        print(
            f": there was trouble: {exc}.\n: looks like you entered a directory. use '-d' for that."
        )
        sys.exit(1)
    except FileNotFoundError:
        print(fnf)
        sys.exit(1)
def mkbtext(texttype, args_ss, args_wf):
    """Build a ``markovify.Text`` model from *texttype*.

    *args_ss* is forwarded as ``state_size`` and *args_wf* as
    ``well_formed``.
    """
    model_options = {"state_size": args_ss, "well_formed": args_wf}
    return markovify.Text(texttype, **model_options)
def mkbnewline(texttype, args_ss, args_wf):
    """Build a ``markovify.NewlineText`` model from *texttype*.

    *args_ss* is forwarded as ``state_size`` and *args_wf* as
    ``well_formed``.
    """
    model_options = {"state_size": args_ss, "well_formed": args_wf}
    return markovify.NewlineText(texttype, **model_options)
def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """Generate short sentences and append them to the output file.

    Args:
        tmodel: Model object providing ``make_short_sentence``.
        args_sen: Number of sentences to write.
        args_out: Path of the file to append to.
        args_over: Value passed as ``max_overlap_ratio``.
        args_len: Value passed as ``max_chars``.

    Each sentence is written preceded by a newline and followed by a blank
    line; a line holding ``*`` closes the batch.
    """
    # The context manager closes the file even if sentence generation raises.
    # UTF-8 matches how the input text is read and keeps non-ASCII output
    # writable whatever the platform's locale encoding is.
    with open(args_out, "a", encoding="utf-8") as output:
        for _ in range(args_sen):
            sentence = tmodel.make_short_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() keeps the write from failing on a non-string result
            # (presumably None when no sentence could be built -- confirm
            # against markovify).
            output.write("\n" + str(sentence) + "\n\n")
        output.write("*\n\n")
def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """Generate sentences and append them to the output file.

    Args:
        tmodel: Model object providing ``make_sentence``.
        args_sen: Number of sentences to write.
        args_out: Path of the file to append to.
        args_over: Value passed as ``max_overlap_ratio``.
        args_len: Value passed as ``max_chars``.

    Each sentence is written followed by a blank line; a line holding ``*``
    closes the batch.
    """
    # The context manager closes the file even if sentence generation raises.
    # UTF-8 matches how the input text is read and keeps non-ASCII output
    # writable whatever the platform's locale encoding is.
    with open(args_out, "a", encoding="utf-8") as output:
        for _ in range(args_sen):
            sentence = tmodel.make_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() keeps the write from failing on a non-string result
            # (presumably None when no sentence could be built -- confirm
            # against markovify).
            output.write(str(sentence) + "\n\n")
        output.write("*\n\n")
# functions for args.directory:
def dir_list(directory):
    """Return a list of all text files found under a directory.

    The tree below *directory* is walked recursively and every file whose
    name ends in ``.txt``, ``.org`` or ``.md`` (in any letter case) is
    collected.

    Args:
        directory: Path of the directory to search.

    Returns:
        A list of paths, each joined from the walked root and the file name.

    If *directory* is not an existing directory, an error is printed and
    the program exits with status 1.
    """
    if not os.path.isdir(directory):
        print(": error: please enter a valid directory")
        sys.exit(1)
    extensions = (".txt", ".org", ".md")
    # create a list of files to concatenate; the name is lowercased so that
    # e.g. "NOTES.TXT" is matched too.
    matches = [
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(directory)
        for filename in filenames
        if filename.lower().endswith(extensions)
    ]
    print(": text files fetched and combined")
    return matches
def dir_cat(matchlist, bulkfile):
    """Concatenate a list of files into one single file.

    Args:
        matchlist: Iterable of paths of the files to concatenate, in order.
        bulkfile: Path of the combined file; it is overwritten.

    Each source file is decoded as UTF-8, falling back to Latin-1 when that
    fails. Nothing is returned; the result is the file written at
    *bulkfile*.
    """
    # Written as UTF-8 so the combined text can always be encoded, whatever
    # the platform's locale encoding is.
    with open(bulkfile, "w", encoding="utf-8") as outfile:
        for fname in matchlist:
            try:
                with open(fname, encoding="utf-8") as infile:
                    text = infile.read()
            except UnicodeDecodeError:
                with open(fname, encoding="latin-1") as infile:
                    text = infile.read()
            outfile.write(text)
# extract full text from a pdf:
def convert_pdf(path):
    """Extract the full text of a pdf file using pdfminer.

    Args:
        path: Path of the pdf file to convert.

    Returns:
        The text of all pages as one string.

    If pdfminer cannot be imported, or anything goes wrong while opening or
    converting the file, a message is printed and the program exits with
    status 1.
    """
    # Imported here so the rest of the module works without pdfminer.
    try:
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfpage import PDFPage
        from io import StringIO
    except ModuleNotFoundError as exc:
        print(
            f": there was trouble: {exc}.\n: install 'pdfminer.six' with pip to convert a pdf."
        )
        sys.exit(1)
    print(": converting pdf file...")
    try:
        rsrcmgr = PDFResourceManager()
        # The with-statement and the finally below release the buffer, the
        # file and the converter even when a page fails to process.
        with StringIO() as retstr, open(path, "rb") as fp:
            device = TextConverter(
                rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                ):
                    interpreter.process_page(page)
                # Must be read before the buffer is closed.
                text = retstr.getvalue()
            finally:
                device.close()
    except Exception as exc:
        # Deliberately broad: any failure here is reported to the user and
        # ends the run.
        print(f": there was trouble: {exc}.\n: please enter a valid pdf")
        sys.exit(1)
    else:
        print(": pdf converted.")
        return text