|
|
|
@ -5,11 +5,6 @@ import markovify
|
|
|
|
|
import sys
|
|
|
|
|
import html2text
|
|
|
|
|
|
|
|
|
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
|
|
|
from pdfminer.converter import TextConverter
|
|
|
|
|
from pdfminer.layout import LAParams
|
|
|
|
|
from pdfminer.pdfpage import PDFPage
|
|
|
|
|
from io import StringIO
|
|
|
|
|
|
|
|
|
|
# shared message text for a "file not found" error.
# NOTE(review): the code that prints it is not visible in this excerpt — confirm usage.
fnf = ": error: file not found. please provide a path to a really-existing file!"
|
|
|
|
|
|
|
|
|
@ -144,6 +139,17 @@ def dir_cat(matchlist, bulkfile):
|
|
|
|
|
|
|
|
|
|
# extract full text from a pdf:
|
|
|
|
|
def convert_pdf(path):
|
|
|
|
|
try:
|
|
|
|
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
|
|
|
from pdfminer.converter import TextConverter
|
|
|
|
|
from pdfminer.layout import LAParams
|
|
|
|
|
from pdfminer.pdfpage import PDFPage
|
|
|
|
|
from io import StringIO
|
|
|
|
|
except ModuleNotFoundError as exc:
|
|
|
|
|
print(
|
|
|
|
|
f": there was trouble: {exc}.\n: install 'pdfminer.six' with pip to convert a pdf."
|
|
|
|
|
)
|
|
|
|
|
sys.exit()
|
|
|
|
|
print(": converting pdf file...")
|
|
|
|
|
try:
|
|
|
|
|
rsrcmgr = PDFResourceManager()
|
|
|
|
|