mkv-this/mkv_this/functions.py

184 lines
5.4 KiB
Python

import os
import re
import requests
import markovify
import sys
import html2text
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
# shared message printed by read() when the given input path does not exist.
fnf = ": error: file not found. please provide a path to a really-existing file!"
def url(insert, timeout=30):
    """Fetch a webpage and return its body as html text.

    Args:
        insert: the url to fetch.
        timeout: seconds to wait on the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive host.

    Returns:
        The decoded response body.

    Any failure (bad url, connection problem, timeout, HTTP error status)
    is reported on stdout and the program exits via sys.exit().
    """
    try:
        req = requests.get(insert, timeout=timeout)
        req.raise_for_status()
        # re-detect the encoding from the content itself, to catch pages that
        # would otherwise be mis-decoded as ISO-8859-1/Latin-1.
        req.encoding = req.apparent_encoding
    except Exception as exc:
        # broad on purpose: this is the user-facing boundary for url input.
        print(f": there was trouble: {exc}.\n: please enter a valid url.")
        sys.exit()
    else:
        print(": fetched html from url.")
        return req.text
def convert_html(html):
    """Turn an html document into plain text.

    The page is rendered by html2text with links, emphasis and tables
    ignored and images replaced by their alt text; every '#' and '*'
    left in the result is then removed.
    """
    converter = html2text.HTML2Text()
    settings = {
        "images_to_alt": True,
        "ignore_links": True,
        "ignore_emphasis": True,
        "ignore_tables": True,
        "unicode_snob": False,
        "decode_errors": "replace",
        "escape_all": False,  # remove all noise if needed
    }
    for option, value in settings.items():
        setattr(converter, option, value)

    rendered = converter.handle(html)
    # drop hashes and stars from the 'markdown'
    plain = rendered.translate(str.maketrans("", "", "#*"))
    print(": html converted to plain text")
    return plain
def read(infile):
    """Return the text of a file, ready to feed to a markov model.

    The file is decoded as utf-8, falling back to latin-1 when that fails.
    A path ending in '.pdf', a directory, or a missing file each print a
    hint and exit via sys.exit().
    """
    # pdfs are rejected up front; they need their own conversion step.
    if infile.lower().endswith(".pdf"):
        print(
            "looks like you entered a pdf file. you need to use the '-P' flag to convert it."
        )
        sys.exit()

    try:
        with open(infile, encoding="utf-8") as handle:
            return handle.read()
    except UnicodeDecodeError:
        # not valid utf-8: latin-1 can decode any byte sequence.
        with open(infile, encoding="latin-1") as handle:
            return handle.read()
    except IsADirectoryError as exc:
        print(
            f": there was trouble: {exc}.\n: looks like you entered a directory. Use '-d' for that."
        )
        sys.exit()
    except FileNotFoundError:
        print(fnf)
        sys.exit()
def mkbtext(texttype, args_ss, args_wf):
    """Build a markovify.Text model.

    texttype is passed as the input text, args_ss as the model's
    state_size and args_wf as its well_formed flag.
    """
    options = {"state_size": args_ss, "well_formed": args_wf}
    return markovify.Text(texttype, **options)
def mkbnewline(texttype, args_ss, args_wf):
    """Build a markovify.NewlineText model.

    texttype is passed as the input text, args_ss as the model's
    state_size and args_wf as its well_formed flag.
    """
    options = {"state_size": args_ss, "well_formed": args_wf}
    return markovify.NewlineText(texttype, **options)
def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """Generate short sentences from a markov model and append them to a file.

    Args:
        tmodel: model object providing make_short_sentence().
        args_sen: how many sentences to attempt.
        args_out: path of the output file; it is opened in append mode.
        args_over: passed to the model as max_overlap_ratio.
        args_len: passed to the model as max_chars.

    Each sentence is written as a blank line, the sentence, then a blank
    line; a closing "*" line marks the end of the batch. When the model
    cannot produce a sentence (it returns None) a notice is printed and
    nothing is written for that attempt.
    """
    # `with` closes the file even if generation raises; utf-8 is used because
    # the source texts are read as utf-8 and may hold any unicode character.
    with open(args_out, "a", encoding="utf-8") as output:
        for _ in range(args_sen):
            sentence = tmodel.make_short_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            if sentence is None:
                # avoid writing the literal text "None" into the output.
                print(": could not generate a sentence, skipping it.")
                continue
            output.write(f"\n{sentence}\n\n")
        output.write("*\n\n")
def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """Generate sentences from a markov model and append them to a file.

    Args:
        tmodel: model object providing make_sentence().
        args_sen: how many sentences to attempt.
        args_out: path of the output file; it is opened in append mode.
        args_over: passed to the model as max_overlap_ratio.
        args_len: passed to the model as max_chars.
            NOTE(review): max_chars looks like a make_short_sentence option;
            confirm that make_sentence actually honours it.

    Each sentence is written followed by a blank line; a closing "*" line
    marks the end of the batch. When the model cannot produce a sentence
    (it returns None) a notice is printed and nothing is written for that
    attempt.
    """
    # `with` closes the file even if generation raises; utf-8 is used because
    # the source texts are read as utf-8 and may hold any unicode character.
    with open(args_out, "a", encoding="utf-8") as output:
        for _ in range(args_sen):
            sentence = tmodel.make_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            if sentence is None:
                # avoid writing the literal text "None" into the output.
                print(": could not generate a sentence, skipping it.")
                continue
            output.write(f"{sentence}\n\n")
        output.write("*\n\n")
# functions for args.directory:
def dir_list(directory):
    """Collect the paths of all text files found under a directory.

    The tree is walked recursively and every file whose name ends in
    .txt, .org or .md is included. If the argument is not a directory,
    an error is printed and the program exits via sys.exit().
    """
    if not os.path.isdir(directory):
        print(": error: please enter a valid directory")
        sys.exit()

    wanted = (".txt", ".org", ".md")
    found = []
    for folder, _subdirs, names in os.walk(directory):
        found.extend(
            os.path.join(folder, name) for name in names if name.endswith(wanted)
        )
    print(": text files fetched and combined")
    return found
def dir_cat(matchlist, bulkfile):
    """Concatenate a list of text files into one file.

    Args:
        matchlist: iterable of paths to the files to combine, in order.
        bulkfile: path of the combined file; it is created or overwritten.

    Each input is decoded as utf-8, falling back to latin-1 when that
    fails. The combined file is always written as utf-8 so it can hold
    whatever the inputs decoded to, regardless of the platform's default
    encoding. Nothing is returned.
    """
    with open(bulkfile, "w", encoding="utf-8") as outfile:
        for fname in matchlist:
            try:
                with open(fname, encoding="utf-8") as infile:
                    text = infile.read()
            except UnicodeDecodeError:
                # not valid utf-8: latin-1 can decode any byte sequence.
                with open(fname, encoding="latin-1") as infile:
                    text = infile.read()
            outfile.write(text)
# extract full text from a pdf:
def convert_pdf(path):
    """Extract the full text of a pdf file.

    Args:
        path: path to the pdf to convert.

    Returns:
        The extracted text of every page, as a single string.

    Any failure while opening or parsing the file is reported on stdout
    and the program exits via sys.exit().
    """
    print(": converting pdf file...")
    try:
        rsrcmgr = PDFResourceManager()
        # `with` releases the text buffer and the file handle even when
        # parsing fails part-way through.
        with StringIO() as retstr, open(path, "rb") as fp:
            device = TextConverter(
                rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # empty page set, maxpages=0 and empty password, as before;
                # presumably this selects every page of an unencrypted pdf.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
                # must be read before the buffer is closed.
                text = retstr.getvalue()
            finally:
                device.close()
    except Exception as exc:
        # broad on purpose: this is the user-facing boundary for pdf input.
        print(f": there was trouble: {exc}.\n: please enter a valid pdf")
        sys.exit()
    else:
        print(": pdf converted.")
        return text