From 1ed0760f7e1bc1adb7b7b1deb4d5dec7cda53ac1 Mon Sep 17 00:00:00 2001
From: mousebot
Date: Thu, 30 Apr 2020 12:13:53 -0300
Subject: [PATCH] add pdf option

---
Review notes (not part of the commit message):
- Adds a -P/--pdf flag; when it is set, main() reads infile through
  convert_pdf_to_txt() (new in functions.py, built on pdfminer).
- mkv_this.py now imports and calls convert_pdf_to_txt, the name that
  functions.py defines. The hunks previously referenced a convert_pdf
  that this patch never defines (import list and two of the three call
  sites), while the third call site used a name that was not imported.
- NOTE(review): whitespace was reconstructed from a collapsed rendering of
  this patch. Context-line indentation and blank context lines are a best
  guess; verify against the tree, or try `git apply --ignore-whitespace`.

 mkv_this/functions.py | 50 +++++++++++++++++++++++++++++++++++++++++--
 mkv_this/mkv_this.py  | 25 ++++++++++++++++------
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/mkv_this/functions.py b/mkv_this/functions.py
index a3d4eb5..7116623 100644
--- a/mkv_this/functions.py
+++ b/mkv_this/functions.py
@@ -5,6 +5,12 @@ import markovify
 import sys
 import html2text
 
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.converter import TextConverter
+from pdfminer.layout import LAParams
+from pdfminer.pdfpage import PDFPage
+from io import StringIO
+
 fnf = ": error: file not found. please provide a path to a really-existing file!"
 
 
@@ -73,8 +79,8 @@ def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
     output = open(args_out, "a")  # append
     for i in range(args_sen):
         output.write(
-            "\n" +
-            str(
+            "\n"
+            + str(
                 tmodel.make_short_sentence(
                     tries=2000, max_overlap_ratio=args_over, max_chars=args_len
                 )
@@ -129,3 +135,43 @@ def dir_cat(matchlist, bulkfile):
             except UnicodeDecodeError:
                 with open(fname, encoding="latin-1") as infile:
                     outfile.write(infile.read())
+
+
+# extract full text from a pdf:
+def convert_pdf_to_txt(path):
+    print("converting pdf file...")
+    try:
+        rsrcmgr = PDFResourceManager()
+        retstr = StringIO()
+        codec = "utf-8"
+        laparams = LAParams()
+        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
+        fp = open(path, "rb")
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        password = ""
+        maxpages = 0
+        caching = True
+        pagenos = set()
+
+        for page in PDFPage.get_pages(
+            fp,
+            pagenos,
+            maxpages=maxpages,
+            password=password,
+            caching=caching,
+            check_extractable=True,
+        ):
+            interpreter.process_page(page)
+
+        text = retstr.getvalue()
+
+        fp.close()
+        device.close()
+        retstr.close()
+
+    except Exception as exc:
+        print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
+        sys.exit()
+    else:
+        print(": pdf converted.")
+        return text
diff --git a/mkv_this/mkv_this.py b/mkv_this/mkv_this.py
index c0ec068..68d27ac 100755
--- a/mkv_this/mkv_this.py
+++ b/mkv_this/mkv_this.py
@@ -35,6 +35,7 @@ from .functions import (
     mkbnewline,
     writesentence,
     writeshortsentence,
+    convert_pdf_to_txt,
 )
 
 # argparse
@@ -54,10 +55,7 @@ def parse_the_args():
     )
     # optional args:
     parser.add_argument(
-        "-u",
-        "--URL",
-        help="infile is a URL. all text it contains will be used.",
-        action="store_true",
+        "-u", "--URL", help="infile is a URL.", action="store_true",
     )
     parser.add_argument(
         "-d",
@@ -65,6 +63,12 @@ def parse_the_args():
         help="infile is a directory. all text files in it and its subdirectories will be used.",
         action="store_true",
     )
+    parser.add_argument(
+        "-P",
+        "--pdf",
+        help="infile is a pdf. NB: for this to work you need to install pdfminer.",
+        action="store_true",
+    )
     parser.add_argument(
         "-s",
         "--state-size",
@@ -88,7 +92,7 @@ def parse_the_args():
     parser.add_argument(
         "-o",
         "--overlap",
-        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
+        help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
         type=float,
         default=0.5,
     )
@@ -98,7 +102,7 @@ def parse_the_args():
         help="provide an another text file to be combined with the first item.",
     )
     parser.add_argument(
-        "-C", "--combine-URL", help="provide a URL to be combined with the first item"
+        "-C", "--combine-URL", help="provide a URL to be combined with the first item."
     )
     parser.add_argument(
         "-w",
@@ -158,6 +162,9 @@ def main():
                 dir_cat(matchlist, batchfile)
                 text = read(batchfile)
                 os.unlink(batchfile)
+            # infile is pdf:
+            elif args.pdf:
+                text = convert_pdf_to_txt(args.infile)
             # or normal:
             else:
                 text = read(args.infile)
@@ -178,6 +185,8 @@ def main():
                 dir_cat(matchlist, batchfile)
                 text = read(batchfile)
                 os.unlink(batchfile)
+            elif args.pdf:
+                text = convert_pdf_to_txt(args.infile)
             # or normal:
             else:
                 text = read(args.infile)
@@ -211,6 +220,8 @@ def main():
             dir_cat(matchlist, batchfile)
             text = read(batchfile)
             os.unlink(batchfile)
+        elif args.pdf:
+            text = convert_pdf_to_txt(args.infile)
         # or local file:
         else:
             text = read(args.infile)
@@ -248,7 +259,7 @@ def main():
             if args.combine or args.combine_URL:
                 outp.write("weight: " + str(vars(args)["weight"]) + " | ")
             outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
-            outp.write("state size: " + str(vars(args)["state_size"]))
+            outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
             outp.write("\n")
 
         # write it!