add pdf option

This commit is contained in:
mousebot 2020-04-30 12:13:53 -03:00
parent e9fe8775ad
commit 1ed0760f7e
2 changed files with 66 additions and 9 deletions

View File

@ -5,6 +5,12 @@ import markovify
import sys
import html2text
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
fnf = ": error: file not found. please provide a path to a really-existing file!"
@ -73,8 +79,8 @@ def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
output = open(args_out, "a") # append
for i in range(args_sen):
output.write(
"\n" +
str(
"\n"
+ str(
tmodel.make_short_sentence(
tries=2000, max_overlap_ratio=args_over, max_chars=args_len
)
@ -129,3 +135,43 @@ def dir_cat(matchlist, bulkfile):
except UnicodeDecodeError:
with open(fname, encoding="latin-1") as infile:
outfile.write(infile.read())
# extract full text from a pdf:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* and return it as one string.

    Progress messages are printed to stdout.  If anything goes wrong
    (missing file, unreadable or non-extractable PDF, pdfminer error, ...)
    a message is printed and the program terminates via ``sys.exit()``;
    no exception propagates to the caller.

    Args:
        path: filesystem path of the PDF file to read.

    Returns:
        The text collected from every processed page.
    """
    print("converting pdf file...")
    try:
        rsrcmgr = PDFResourceManager()
        # The context managers guarantee that the text buffer and the PDF
        # file handle are released even when pdfminer raises midway; closing
        # them only after the loop leaked both on the error path.
        with StringIO() as retstr, open(path, "rb") as fp:
            device = TextConverter(
                rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # NOTE(review): an empty page-number set together with
                # maxpages=0 presumably selects every page -- confirm
                # against the installed pdfminer version.
                for page in PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                ):
                    interpreter.process_page(page)
                # Read the buffer before it is closed by the ``with`` exit.
                text = retstr.getvalue()
            finally:
                device.close()
    except Exception as exc:
        print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
        sys.exit()
    else:
        print(": pdf converted.")
        return text


# Backward-compatible alias: the command-line module changed alongside this
# function imports and calls the converter as ``convert_pdf``.
convert_pdf = convert_pdf_to_txt

View File

@ -35,6 +35,7 @@ from .functions import (
mkbnewline,
writesentence,
writeshortsentence,
convert_pdf,
)
# argparse
@ -54,10 +55,7 @@ def parse_the_args():
)
# optional args:
parser.add_argument(
"-u",
"--URL",
help="infile is a URL. all text it contains will be used.",
action="store_true",
"-u", "--URL", help="infile is a URL.", action="store_true",
)
parser.add_argument(
"-d",
@ -65,6 +63,12 @@ def parse_the_args():
help="infile is a directory. all text files in it and its subdirectories will be used.",
action="store_true",
)
parser.add_argument(
"-P",
"--pdf",
help="infile is a pdf. NB: for this to work you need to install pdfminer.",
action="store_true",
)
parser.add_argument(
"-s",
"--state-size",
@ -88,7 +92,7 @@ def parse_the_args():
parser.add_argument(
"-o",
"--overlap",
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
type=float,
default=0.5,
)
@ -98,7 +102,7 @@ def parse_the_args():
help="provide an another text file to be combined with the first item.",
)
parser.add_argument(
"-C", "--combine-URL", help="provide a URL to be combined with the first item"
"-C", "--combine-URL", help="provide a URL to be combined with the first item."
)
parser.add_argument(
"-w",
@ -158,6 +162,9 @@ def main():
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
# infile is pdf:
elif args.pdf:
text = convert_pdf(args.infile)
# or normal:
else:
text = read(args.infile)
@ -178,6 +185,8 @@ def main():
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
elif args.pdf:
text = convert_pdf(args.infile)
# or normal:
else:
text = read(args.infile)
@ -211,6 +220,8 @@ def main():
dir_cat(matchlist, batchfile)
text = read(batchfile)
os.unlink(batchfile)
elif args.pdf:
text = convert_pdf_to_txt(args.infile)
# or local file:
else:
text = read(args.infile)
@ -248,7 +259,7 @@ def main():
if args.combine or args.combine_URL:
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
outp.write("state size: " + str(vars(args)["state_size"]))
outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
outp.write("\n")
# write it!