add pdf option
This commit is contained in:
parent
e9fe8775ad
commit
1ed0760f7e
|
@ -5,6 +5,12 @@ import markovify
|
||||||
import sys
|
import sys
|
||||||
import html2text
|
import html2text
|
||||||
|
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
|
from pdfminer.converter import TextConverter
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
|
||||||
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
||||||
|
|
||||||
|
@ -73,8 +79,8 @@ def writeshortsentence(tmodel, args_sen, args_out, args_over, args_len):
|
||||||
output = open(args_out, "a") # append
|
output = open(args_out, "a") # append
|
||||||
for i in range(args_sen):
|
for i in range(args_sen):
|
||||||
output.write(
|
output.write(
|
||||||
"\n" +
|
"\n"
|
||||||
str(
|
+ str(
|
||||||
tmodel.make_short_sentence(
|
tmodel.make_short_sentence(
|
||||||
tries=2000, max_overlap_ratio=args_over, max_chars=args_len
|
tries=2000, max_overlap_ratio=args_over, max_chars=args_len
|
||||||
)
|
)
|
||||||
|
@ -129,3 +135,43 @@ def dir_cat(matchlist, bulkfile):
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
with open(fname, encoding="latin-1") as infile:
|
with open(fname, encoding="latin-1") as infile:
|
||||||
outfile.write(infile.read())
|
outfile.write(infile.read())
|
||||||
|
|
||||||
|
|
||||||
|
# extract full text from a pdf:
|
||||||
|
def convert_pdf_to_txt(path):
    """Extract the full text of a PDF file with pdfminer.

    Progress messages are printed to stdout. If anything goes wrong
    (the file cannot be opened, pdfminer cannot parse it, text extraction
    is not permitted, ...) an error message is printed and the program is
    terminated through ``sys.exit()``.

    Args:
        path: filesystem path of the PDF file to read.

    Returns:
        The text of all processed pages as a single ``str``.

    Raises:
        SystemExit: on any error while opening or converting the file.
    """
    print("converting pdf file...")
    try:
        rsrcmgr = PDFResourceManager()
        # Context managers guarantee that the text buffer and the PDF file
        # handle are released even when a page fails to convert; previously
        # they were only closed on the success path.
        with StringIO() as retstr, open(path, "rb") as fp:
            device = TextConverter(
                rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(
                    fp,
                    set(),  # no explicit page numbers selected
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
                # Read the buffer before it is closed by the `with` block.
                text = retstr.getvalue()
            finally:
                device.close()
    except Exception as exc:
        # Deliberately broad: any failure is reported to the user as
        # "not a valid pdf" and ends the program.
        # NOTE(review): sys.exit() without an argument exits with status 0;
        # consider a non-zero status if callers script around this tool.
        print(f": There was a problem: {exc}.\n: Please enter a valid pdf")
        sys.exit()
    else:
        print(": pdf converted.")
        return text


# Backward-compatible alias: other code in this package imports and calls
# this function under the shorter name `convert_pdf`.
convert_pdf = convert_pdf_to_txt
|
||||||
|
|
|
@ -35,6 +35,7 @@ from .functions import (
|
||||||
mkbnewline,
|
mkbnewline,
|
||||||
writesentence,
|
writesentence,
|
||||||
writeshortsentence,
|
writeshortsentence,
|
||||||
|
convert_pdf,
|
||||||
)
|
)
|
||||||
|
|
||||||
# argparse
|
# argparse
|
||||||
|
@ -54,10 +55,7 @@ def parse_the_args():
|
||||||
)
|
)
|
||||||
# optional args:
|
# optional args:
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-u",
|
"-u", "--URL", help="infile is a URL.", action="store_true",
|
||||||
"--URL",
|
|
||||||
help="infile is a URL. all text it contains will be used.",
|
|
||||||
action="store_true",
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d",
|
"-d",
|
||||||
|
@ -65,6 +63,12 @@ def parse_the_args():
|
||||||
help="infile is a directory. all text files in it and its subdirectories will be used.",
|
help="infile is a directory. all text files in it and its subdirectories will be used.",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-P",
|
||||||
|
"--pdf",
|
||||||
|
help="infile is a pdf. NB: for this to work you need to install pdfminer.",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-s",
|
"-s",
|
||||||
"--state-size",
|
"--state-size",
|
||||||
|
@ -88,7 +92,7 @@ def parse_the_args():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-o",
|
"-o",
|
||||||
"--overlap",
|
"--overlap",
|
||||||
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5",
|
help="the amount of overlap allowed between original and output, expressed as a ratio between 0 and 1. defaults to 0.5.",
|
||||||
type=float,
|
type=float,
|
||||||
default=0.5,
|
default=0.5,
|
||||||
)
|
)
|
||||||
|
@ -98,7 +102,7 @@ def parse_the_args():
|
||||||
help="provide an another text file to be combined with the first item.",
|
help="provide an another text file to be combined with the first item.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-C", "--combine-URL", help="provide a URL to be combined with the first item"
|
"-C", "--combine-URL", help="provide a URL to be combined with the first item."
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-w",
|
"-w",
|
||||||
|
@ -158,6 +162,9 @@ def main():
|
||||||
dir_cat(matchlist, batchfile)
|
dir_cat(matchlist, batchfile)
|
||||||
text = read(batchfile)
|
text = read(batchfile)
|
||||||
os.unlink(batchfile)
|
os.unlink(batchfile)
|
||||||
|
# infile is pdf:
|
||||||
|
elif args.pdf:
|
||||||
|
text = convert_pdf(args.infile)
|
||||||
# or normal:
|
# or normal:
|
||||||
else:
|
else:
|
||||||
text = read(args.infile)
|
text = read(args.infile)
|
||||||
|
@ -178,6 +185,8 @@ def main():
|
||||||
dir_cat(matchlist, batchfile)
|
dir_cat(matchlist, batchfile)
|
||||||
text = read(batchfile)
|
text = read(batchfile)
|
||||||
os.unlink(batchfile)
|
os.unlink(batchfile)
|
||||||
|
elif args.pdf:
|
||||||
|
text = convert_pdf(args.infile)
|
||||||
# or normal:
|
# or normal:
|
||||||
else:
|
else:
|
||||||
text = read(args.infile)
|
text = read(args.infile)
|
||||||
|
@ -211,6 +220,8 @@ def main():
|
||||||
dir_cat(matchlist, batchfile)
|
dir_cat(matchlist, batchfile)
|
||||||
text = read(batchfile)
|
text = read(batchfile)
|
||||||
os.unlink(batchfile)
|
os.unlink(batchfile)
|
||||||
|
elif args.pdf:
|
||||||
|
text = convert_pdf_to_txt(args.infile)
|
||||||
# or local file:
|
# or local file:
|
||||||
else:
|
else:
|
||||||
text = read(args.infile)
|
text = read(args.infile)
|
||||||
|
@ -248,7 +259,7 @@ def main():
|
||||||
if args.combine or args.combine_URL:
|
if args.combine or args.combine_URL:
|
||||||
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
|
outp.write("weight: " + str(vars(args)["weight"]) + " | ")
|
||||||
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
|
outp.write("overlap: " + str(vars(args)["overlap"]) + " | ")
|
||||||
outp.write("state size: " + str(vars(args)["state_size"]))
|
outp.write("state size: " + str(vars(args)["state_size"]) + "\n")
|
||||||
outp.write("\n")
|
outp.write("\n")
|
||||||
|
|
||||||
# write it!
|
# write it!
|
||||||
|
|
Loading…
Reference in New Issue