Extracting text from pdf

2022-10-27 11:06:55 -03:00 · 2022-10-27 11:06:55 -03:00 · 46246663fd
parent 50e1dc9340
commit 46246663fd
1 changed files with 52 additions and 0 deletions
--- a/script/extractpdf.R
+++ b/script/extractpdf.R
@@ -0,0 +1,52 @@
+#' ---
+#' title: "Using pdftools in R to explore files in PDF format"
+#' date: "2022-06-24"
+#' author: "Jose https://ajuda.multifarm.top"
+#' output:
+#'   html_document:
+#'    code_folding: show
+#'    toc: yes
+#'    toc_float:
+#'      smooth_scroll: true
+#'    df_print: paged
+#'    highlight: zenburn
+#' ---
+
+#' Run this script in a fresh R session instead of clearing the workspace
+#' from code: `rm(list = ls())` would wipe the caller's objects when the
+#' script is sourced, yet leave loaded packages and options untouched.
+
+#' Libraries
+
+library(pdftools)
+library(data.table)
+
+help(package = "pdftools")
+
+?pdftools
+
+#'------------------------------------------------------------------------------
+#'
+#' ## Data
+#'
+#' ------------------------------------------------------------------------------
+
+#' Choosing the pdf (a remote file here; `pdf` may also be a local path,
+#' e.g. `file.path("..", "local", "Report remote sensing.pdf")`)
+pdf <- "http://the-little-prince.site/appendix/The_Little_Prince.pdf"
+
+unlist(pdf_info(pdf))
+
+#' Reading the pdf
+
+rs <- pdf_text(pdf)
+
+str(rs)
+
+# Print elements 50, 12 and 13 of the extracted text, in that order
+
+for (page in c(50, 12, 13)) {
+  print(rs[page])
+}
+
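+#' pdftools::pdf_ocr_text('http://the-little-prince.site/appendix/The_Little_Prince.pdf', pages = 13) #' OCR runs on the pdf file itself, not on text already extracted into `rs`. Needs the 'tesseract' package for R and the tesseract engine installed on your system. In Parabola GNU/Linux, use: `pacman -S tesseract`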