From 46246663fd6be83c5196a7fad83abae754ff4724 Mon Sep 17 00:00:00 2001
From: Jose
Date: Thu, 27 Oct 2022 11:06:55 -0300
Subject: [PATCH] Extracting text from pdf

---
 script/extractpdf.R | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100755 script/extractpdf.R

diff --git a/script/extractpdf.R b/script/extractpdf.R
new file mode 100755
index 0000000..45a4c13
--- /dev/null
+++ b/script/extractpdf.R
@@ -0,0 +1,62 @@
+#' ---
+#' title: "Using pdf tools in R to explore files in pdf format"
+#' date: "2022-06-24"
+#' author: "Jose https://ajuda.multifarm.top"
+#' output:
+#'   html_document:
+#'     code_folding: show
+#'     toc: yes
+#'     toc_float:
+#'       smooth_scroll: true
+#'     df_print: paged
+#'     highlight: zenburn
+#' ---
+
+#' Libraries
+
+library(pdftools)
+library(data.table)
+
+#' Open the package help only in interactive sessions, so that running or
+#' rendering the script non-interactively is not interrupted.
+
+if (interactive()) {
+  help(package = 'pdftools')
+  ?pdftools
+}
+
+#'------------------------------------------------------------------------------
+#'
+#' ## Data
+#'
+#' ------------------------------------------------------------------------------
+
+#' Location of the pdf. For a local file use instead, e.g.:
+#'
+#' pdf_path <- file.path('../local/', 'Report remote sensing.pdf')
+
+pdf_path <- 'http://the-little-prince.site/appendix/The_Little_Prince.pdf'
+
+#' Document metadata
+
+unlist(pdf_info(pdf_path))
+
+#' Reading the pdf: one character string per page
+
+rs <- pdf_text(pdf_path)
+
+str(rs)
+
+#' Inspect a few pages (a page number beyond the document gives `NA`)
+
+rs[50]
+
+rs[12]
+
+rs[13]
+
+#' OCR needs the 'tesseract' package for R and tesseract installed on your
+#' system. In Parabola GNU-linux, use: `pacman -S tesseract`. `pdf_ocr_text()`
+#' takes the pdf itself, not the extracted text:
+#'
+#' `pdftools::pdf_ocr_text(pdf_path, pages = 13)`