add pdf option

This commit is contained in:
mousebot 2020-04-30 12:13:53 -03:00
parent e9fe8775ad
commit 1ed0760f7e
2 changed files with 66 additions and 9 deletions

View File

@ -5,6 +5,12 @@ import markovify
import sys import sys
import html2text import html2text
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
fnf = ": error: file not found. please provide a path to a really-existing file!" fnf = ": error: file not found. please provide a path to a really-existing file!"
@ -73,8 +79,8 @@ def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
output = open(args_out, "a") # append output = open(args_out, "a") # append
for i in range(args_sen): for i in range(args_sen):
output.write( output.write(
"\n" + "\n"
str( + str(
tmodel.make_short_sentence( tmodel.make_short_sentence(
tries=2000, max_overlap_ratio=args_over, max_chars=args_len tries=2000, max_overlap_ratio=args_over, max_chars=args_len
) )
@ -129,3 +135,43 @@ def dir_cat(matchlist, bulkfile):
except UnicodeDecodeError: except UnicodeDecodeError:
with open(fname, encoding="latin-1") as infile: with open(fname, encoding="latin-1") as infile:
outfile.write(infile.read()) outfile.write(infile.read())
# extract full text from a pdf:
def convert_pdf_to_txt(path):
    """Extract all text from the PDF file at *path* using pdfminer.

    Progress and error messages are printed to stdout.

    Args:
        path: filesystem path of the PDF file to read.

    Returns:
        The extracted text of every page as a single string.

    Raises:
        SystemExit: with exit status 1 if the file cannot be opened or
            pdfminer fails to process it.
    """
    print("converting pdf file...")
    try:
        # Context managers guarantee the file and the text buffer are closed
        # even when pdfminer raises part-way through the document.
        with open(path, "rb") as fp, StringIO() as retstr:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(
                rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # An empty page-number set and maxpages=0 request every page;
                # check_extractable=True makes pdfminer refuse documents that
                # forbid text extraction instead of silently returning nothing.
                for page in PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                ):
                    interpreter.process_page(page)
                # Read the buffer before the ``with`` block closes it.
                text = retstr.getvalue()
            finally:
                device.close()
    except Exception as exc:
        # Top-level CLI boundary: report the problem and stop. Exit with a
        # non-zero status so shells and scripts can detect the failure.
        print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
        sys.exit(1)
    print(": pdf converted.")
    return text


# The command-line module imports this function as ``convert_pdf``; keep that
# name available so the import and its call sites resolve.
convert_pdf = convert_pdf_to_txt

View File

@ -35,6 +35,7 @@ from .functions import (
mkbnewline, mkbnewline,
writesentence, writesentence,
writeshortsentence, writeshortsentence,
convert_pdf,
) )
# argparse # argparse
@ -54,10 +55,7 @@ def parse_the_args():
) )
# optional args: # optional args:
parser.add_argument( parser.add_argument(
"-u", "-u", "--URL", help="infile is a URL.", action="store_true",
"--URL",
help="infile is a URL. all text it contains will be used.",
action="store_true",
) )
parser.add_argument( parser.add_argument(
"-d", "-d",
@ -65,6 +63,12 @@ def parse_the_args():
help="infile is a directory. all text files in it and its subdirectories will be used.", help="infile is a directory. all text files in it and its subdirectories will be used.",
action="store_true", action="store_true",
) )
parser.add_argument(
"-P",
"--pdf",
help="infile is a pdf. NB: for this to work you need to install pdfminer.",
action="store_true",
)
parser.add_argument( parser.add_argument(
"-s", "-s",
"--state-size", "--state-size",
@ -88,7 +92,7 @@ def parse_the_args():
parser.add_argument( parser.add_argument(
"-o", "-o",
"--overlap", "--overlap",
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5", help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
type=float, type=float,
default=0.5, default=0.5,
) )
@ -98,7 +102,7 @@ def parse_the_args():
help="provide an another text file to be combined with the first item.", help="provide an another text file to be combined with the first item.",
) )
parser.add_argument( parser.add_argument(
"-C", "--combine-URL", help="provide a URL to be combined with the first item" "-C", "--combine-URL", help="provide a URL to be combined with the first item."
) )
parser.add_argument( parser.add_argument(
"-w", "-w",
@ -158,6 +162,9 @@ def main():
dir_cat(matchlist, batchfile) dir_cat(matchlist, batchfile)
text = read(batchfile) text = read(batchfile)
os.unlink(batchfile) os.unlink(batchfile)
# infile is pdf:
elif args.pdf:
text = convert_pdf(args.infile)
# or normal: # or normal:
else: else:
text = read(args.infile) text = read(args.infile)
@ -178,6 +185,8 @@ def main():
dir_cat(matchlist, batchfile) dir_cat(matchlist, batchfile)
text = read(batchfile) text = read(batchfile)
os.unlink(batchfile) os.unlink(batchfile)
elif args.pdf:
text = convert_pdf(args.infile)
# or normal: # or normal:
else: else:
text = read(args.infile) text = read(args.infile)
@ -211,6 +220,8 @@ def main():
dir_cat(matchlist, batchfile) dir_cat(matchlist, batchfile)
text = read(batchfile) text = read(batchfile)
os.unlink(batchfile) os.unlink(batchfile)
elif args.pdf:
text = convert_pdf_to_txt(args.infile)
# or local file: # or local file:
else: else:
text = read(args.infile) text = read(args.infile)
@ -248,7 +259,7 @@ def main():
if args.combine or args.combine_URL: if args.combine or args.combine_URL:
outp.write("weight: " + str(vars(args)["weight"]) + " | ") outp.write("weight: " + str(vars(args)["weight"]) + " | ")
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ") outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
outp.write("state size: " + str(vars(args)["state_size"])) outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
outp.write("\n") outp.write("\n")
# write it! # write it!