From 46246663fd6be83c5196a7fad83abae754ff4724 Mon Sep 17 00:00:00 2001
From: Jose <greenleaves@disroot.org>
Date: Thu, 27 Oct 2022 11:06:55 -0300
Subject: [PATCH] Extracting text from pdf

---
 script/extractpdf.R | 52 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100755 script/extractpdf.R

diff --git a/script/extractpdf.R b/script/extractpdf.R
new file mode 100755
index 0000000..45a4c13
--- /dev/null
+++ b/script/extractpdf.R
@@ -0,0 +1,52 @@
+#' ---
+#' title: "Using pdf tools in R to explore files in pdf format"
+#' date: "2022-06-24"
+#' author: "Jose https://ajuda.multifarm.top"
+#' output:
+#'   html_document:
+#'    code_folding: show
+#'    toc: yes
+#'    toc_float:
+#'      smooth_scroll: true
+#'    df_print: paged
+#'    highlight: zenburn
+#' ---
+
+#' Run this script in a fresh R session. It intentionally does not call
+#' `rm(list = ls())`, which would wipe the caller's workspace when sourced.
+
+#' Libraries
+
+library(pdftools)
+library(data.table)
+
+#' Open the package index and help topic only in interactive sessions
+
+if (interactive()) help(package = 'pdftools')
+if (interactive()) help('pdftools')
+
+#' ------------------------------------------------------------------------------
+#'
+#' ## Data
+#'
+#' ------------------------------------------------------------------------------
+
+#' Source PDF. For a local file use a path instead, e.g.
+#' `file.path('../local', 'Report remote sensing.pdf')`.
+
+pdf_file <- 'http://the-little-prince.site/appendix/The_Little_Prince.pdf'
+
+#' Document metadata, then the extracted text stored in `rs`
+
+unlist(pdf_info(pdf_file))
+rs <- pdf_text(pdf_file)
+# Inspect `rs`: compact structure first, then elements 50, 12 and 13 (auto-printed)
+str(rs)
+
+rs[50]
+
+rs[12]
+
+rs[13]
+
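+#' For scanned pages, OCR the PDF itself (not the extracted text), e.g. `pdftools::pdf_ocr_text('file.pdf', pages = 13)`. This needs the 'tesseract' R package and the tesseract library installed on your system; on Parabola GNU/Linux use `pacman -S tesseract`.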