# mkv-this/mkv_this/functions.py

import os
import re
import requests
import markovify
import sys
import html2text


# message printed by read() when the requested input file does not exist.
fnf = ": error: file not found. please provide a path to a really-existing file!"


def url(insert, timeout=30):
    """ fetch a webpage, return it as html

    insert: the url to fetch.
    timeout: seconds to wait on the server before giving up. it is handed
        straight to requests.get; without it a stalled server would hang the
        program forever.
    returns: the decoded body of the response.

    on any failure (bad url, network trouble, timeout, http error status) a
    message is printed and the program exits with status 1, so a calling
    shell can tell the fetch did not succeed.
    """
    try:
        req = requests.get(insert, timeout=timeout)
        req.raise_for_status()
        # decode with the encoding detected from the content (chardet) to
        # catch encoding issues with ISO-8859-1/Latin-1.
        req.encoding = req.apparent_encoding
    except Exception as exc:
        # deliberately broad: this is the user-facing boundary for the fetch.
        print(f": there was trouble: {exc}.\n: please enter a valid url.")
        sys.exit(1)
    else:
        print(": fetched html from url.")
        return req.text


def convert_html(html):
    """ turn a html page into plain text, minus the markdown hashes and stars """
    converter = html2text.HTML2Text()
    # converter options, applied in one pass below
    options = {
        "images_to_alt": True,
        "ignore_links": True,
        "ignore_emphasis": True,
        "ignore_tables": True,
        "unicode_snob": False,
        "decode_errors": "replace",
        "escape_all": False,  # remove all noise if needed
    }
    for option, value in options.items():
        setattr(converter, option, value)
    markdown = converter.handle(html)
    # html2text emits markdown; drop the hashes and stars it leaves behind
    plain = re.sub("[#*]", "", markdown)
    print(": html converted to plain text")
    return plain


def read(infile):
    """ read a text file so it's ready for markov

    the file is decoded as utf-8 first, with latin-1 as the fallback when
    that fails.

    infile: path of the file to read.
    returns: the contents of the file as a string.

    pdf paths, directories and missing files are reported to the user and
    the program exits with status 1, so a calling shell sees the failure.
    """
    # pdfs need converting first; refuse them before touching the filesystem.
    if infile.lower().endswith(".pdf"):
        print(
            "looks like you entered a pdf file. you need to use the '-P' flag to convert it."
        )
        sys.exit(1)
    try:
        with open(infile, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # not valid utf-8: retry as latin-1, which accepts any byte sequence.
        with open(infile, encoding="latin-1") as f:
            return f.read()
    except IsADirectoryError as exc:
        print(
            f": there was trouble: {exc}.\n: looks like you entered a directory. use '-d' for that."
        )
        sys.exit(1)
    except FileNotFoundError:
        print(fnf)
        sys.exit(1)


def mkbtext(texttype, args_ss, args_wf):
    """ make a markovify.Text model from the given text

    args_ss is handed on as state_size and args_wf as well_formed.
    """
    model = markovify.Text(texttype, state_size=args_ss, well_formed=args_wf)
    return model


def mkbnewline(texttype, args_ss, args_wf):
    """ make a markovify.NewlineText model from the given text

    args_ss is handed on as state_size and args_wf as well_formed.
    """
    model = markovify.NewlineText(
        texttype, state_size=args_ss, well_formed=args_wf
    )
    return model


def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
    """ generate short sentences and append them to the outfile

    tmodel: the model; must offer make_short_sentence().
    args_sen: how many sentences to write.
    args_out: path of the file to append to (created if missing).
    args_over: passed to the model as max_overlap_ratio.
    args_len: passed to the model as max_chars.

    each sentence is written with a blank line either side, and the batch is
    closed off with a line holding a single '*'.
    """
    # `with` closes the file even if generation raises part-way through;
    # utf-8 is explicit so non-ascii text never depends on the locale.
    with open(args_out, "a", encoding="utf-8") as output:  # append
        for _ in range(args_sen):
            sentence = tmodel.make_short_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() keeps the write from failing on a non-string result;
            # a None from the model ends up as the text "None".
            output.write("\n" + str(sentence) + "\n\n")
        output.write("*\n\n")


def writesentence(tmodel, args_sen, args_out, args_over, args_len):
    """ generate sentences and append them to the outfile

    tmodel: the model; must offer make_sentence().
    args_sen: how many sentences to write.
    args_out: path of the file to append to (created if missing).
    args_over: passed to the model as max_overlap_ratio.
    args_len: passed to the model as max_chars.

    each sentence is followed by a blank line, and the batch is closed off
    with a line holding a single '*'.
    """
    # `with` closes the file even if generation raises part-way through;
    # utf-8 is explicit so non-ascii text never depends on the locale.
    with open(args_out, "a", encoding="utf-8") as output:  # append
        for _ in range(args_sen):
            sentence = tmodel.make_sentence(
                tries=2000, max_overlap_ratio=args_over, max_chars=args_len
            )
            # str() keeps the write from failing on a non-string result;
            # a None from the model ends up as the text "None".
            output.write(str(sentence) + "\n\n")
        output.write("*\n\n")


# functions for args.directory:
def dir_list(directory):
    """ return a list of all text files found under a directory

    the directory is walked recursively and every file whose name ends in
    .txt, .org or .md is collected (the match is case-sensitive).

    directory: path of the directory to search.
    returns: list of paths, each joined onto the directory it was found in.

    if the path is not a directory, a message is printed and the program
    exits with status 1, so a calling shell sees the failure.
    """
    if not os.path.isdir(directory):
        print(": error: please enter a valid directory")
        sys.exit(1)
    # create a list of files to concatenate:
    matches = []
    for root, _dirnames, filenames in os.walk(directory):
        matches.extend(
            os.path.join(root, filename)
            for filename in filenames
            if filename.endswith((".txt", ".org", ".md"))
        )
    print(": text files fetched and combined")
    return matches


def dir_cat(matchlist, bulkfile):
    """ take a list of files, concatenate them into a single file

    matchlist: paths of the files to concatenate, in order.
    bulkfile: path of the file to write; it is overwritten if it exists.

    each input is decoded as utf-8, with latin-1 as the fallback when that
    fails. the bulk file is always written as utf-8, so the result does not
    depend on the locale and any decoded character can be written.
    """
    # the bulk file may itself sit in the directory that was listed;
    # reading it back while it is being written would garble the output.
    target = os.path.realpath(bulkfile)
    with open(bulkfile, "w", encoding="utf-8") as outfile:
        for fname in matchlist:
            if os.path.realpath(fname) == target:
                continue
            try:
                with open(fname, encoding="utf-8") as infile:
                    outfile.write(infile.read())
            except UnicodeDecodeError:
                # not valid utf-8: latin-1 accepts any byte sequence.
                with open(fname, encoding="latin-1") as infile:
                    outfile.write(infile.read())


# extract full text from a pdf:
def convert_pdf(path):
    """ extract the text of a pdf file and return it as a string

    needs the 'pdfminer.six' package, which is imported here rather than at
    the top of the module.

    path: path of the pdf file to convert.
    returns: the text pdfminer extracted from the processed pages.

    if pdfminer is missing, or the file cannot be opened or converted, a
    message is printed and the program exits with status 1, so a calling
    shell sees the failure.
    """
    try:
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfpage import PDFPage
        from io import StringIO
    except ModuleNotFoundError as exc:
        print(
            f": there was trouble: {exc}.\n: install 'pdfminer.six' with pip to convert a pdf."
        )
        sys.exit(1)
    print(": converting pdf file...")
    try:
        rsrcmgr = PDFResourceManager()
        # retstr collects the extracted text; `with` closes it and the pdf
        # file handle even when a page fails to process.
        with StringIO() as retstr:
            device = TextConverter(
                rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
            )
            try:
                with open(path, "rb") as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    # NOTE(review): an empty page set with maxpages=0 is
                    # presumably pdfminer's "no page limit" - confirm.
                    for page in PDFPage.get_pages(
                        fp,
                        set(),
                        maxpages=0,
                        password="",
                        caching=True,
                        check_extractable=True,
                    ):
                        interpreter.process_page(page)
                text = retstr.getvalue()
            finally:
                device.close()
    except Exception as exc:
        # deliberately broad: this is the user-facing boundary for conversion.
        print(f": there was trouble: {exc}.\n: please enter a valid pdf")
        sys.exit(1)
    else:
        print(": pdf converted.")
        return text