add pdf option
This commit is contained in:
parent
e9fe8775ad
commit
1ed0760f7e
|
@ -5,6 +5,12 @@ import markovify
|
|||
import sys
|
||||
import html2text
|
||||
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from io import StringIO
|
||||
|
||||
|
||||
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
||||
|
||||
|
@ -73,8 +79,8 @@ def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
|
|||
output = open(args_out, "a") # append
|
||||
for i in range(args_sen):
|
||||
output.write(
|
||||
"\n" +
|
||||
str(
|
||||
"\n"
|
||||
+ str(
|
||||
tmodel.make_short_sentence(
|
||||
tries=2000, max_overlap_ratio=args_over, max_chars=args_len
|
||||
)
|
||||
|
@ -129,3 +135,43 @@ def dir_cat(matchlist, bulkfile):
|
|||
except UnicodeDecodeError:
|
||||
with open(fname, encoding="latin-1") as infile:
|
||||
outfile.write(infile.read())
|
||||
|
||||
|
||||
# extract full text from a pdf:
|
||||
def convert_pdf_to_txt(path):
    """Extract the full text of a PDF file using pdfminer.

    Args:
        path: filesystem path of the PDF file to read.

    Returns:
        The text pdfminer extracted from the document, as a single ``str``.

    Raises:
        SystemExit: if the file cannot be opened or pdfminer fails while
            processing it. A message describing the problem is printed
            first, and the process exit status is 1.
    """
    # NOTE(review): the CLI module in this change imports and calls
    # `convert_pdf`, which does not match this function's name — confirm
    # which name is intended.
    print("converting pdf file...")
    try:
        rsrcmgr = PDFResourceManager()
        # Both the output buffer and the input file are context-managed so
        # they are released even when opening or parsing fails part-way.
        with StringIO() as retstr, open(path, "rb") as fp:
            device = TextConverter(
                rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                ):
                    interpreter.process_page(page)
                # Read the buffer before the `with` block closes it.
                text = retstr.getvalue()
            finally:
                device.close()
    except Exception as exc:
        # Deliberately broad: this is the command-line boundary, and any
        # failure here is reported to the user instead of a traceback.
        print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
        # Non-zero status so shells and scripts can detect the failure.
        sys.exit(1)
    else:
        print(": pdf converted.")
        return text
|
||||
|
|
|
@ -35,6 +35,7 @@ from .functions import (
|
|||
mkbnewline,
|
||||
writesentence,
|
||||
writeshortsentence,
|
||||
convert_pdf,
|
||||
)
|
||||
|
||||
# argparse
|
||||
|
@ -54,10 +55,7 @@ def parse_the_args():
|
|||
)
|
||||
# optional args:
|
||||
parser.add_argument(
|
||||
"-u",
|
||||
"--URL",
|
||||
help="infile is a URL. all text it contains will be used.",
|
||||
action="store_true",
|
||||
"-u", "--URL", help="infile is a URL.", action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
|
@ -65,6 +63,12 @@ def parse_the_args():
|
|||
help="infile is a directory. all text files in it and its subdirectories will be used.",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-P",
|
||||
"--pdf",
|
||||
help="infile is a pdf. NB: for this to work you need to install pdfminer.",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--state-size",
|
||||
|
@ -88,7 +92,7 @@ def parse_the_args():
|
|||
parser.add_argument(
|
||||
"-o",
|
||||
"--overlap",
|
||||
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
|
||||
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
|
||||
type=float,
|
||||
default=0.5,
|
||||
)
|
||||
|
@ -98,7 +102,7 @@ def parse_the_args():
|
|||
help="provide an another text file to be combined with the first item.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-C", "--combine-URL", help="provide a URL to be combined with the first item"
|
||||
"-C", "--combine-URL", help="provide a URL to be combined with the first item."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-w",
|
||||
|
@ -158,6 +162,9 @@ def main():
|
|||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
# infile is pdf:
|
||||
elif args.pdf:
|
||||
text = convert_pdf(args.infile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
@ -178,6 +185,8 @@ def main():
|
|||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
elif args.pdf:
|
||||
text = convert_pdf(args.infile)
|
||||
# or normal:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
@ -211,6 +220,8 @@ def main():
|
|||
dir_cat(matchlist, batchfile)
|
||||
text = read(batchfile)
|
||||
os.unlink(batchfile)
|
||||
elif args.pdf:
|
||||
text = convert_pdf_to_txt(args.infile)
|
||||
# or local file:
|
||||
else:
|
||||
text = read(args.infile)
|
||||
|
@ -248,7 +259,7 @@ def main():
|
|||
if args.combine or args.combine_URL:
|
||||
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
|
||||
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
|
||||
outp.write("state size: " + str(vars(args)["state_size"]))
|
||||
outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
|
||||
outp.write("\n")
|
||||
|
||||
# write it!
|
||||
|
|
Loading…
Reference in New Issue