intro_r/script/extractpdf.R

53 lines
1019 B
R
Executable File

#' ---
#' title: "Using pdf tools in R to explore files in pdf format"
#' date: "2022-06-24"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---
#' Setup
#'
#' The workspace is deliberately not wiped with `rm(list = ls())`: that call
#' only deletes objects in the global environment (attached packages and
#' options are left as they were) and it destroys the caller's data when this
#' file is sourced. Run the script in a fresh R session instead.
#'
#' Libraries
library(pdftools)
library(data.table)
#' Package help is only useful at the console, so it is skipped when the
#' script is run non-interactively.
if (interactive()) {
  help(package = "pdftools")
  help("pdftools")
}
#'------------------------------------------------------------------------------
#'
#' ## Data
#'
#' ------------------------------------------------------------------------------
#' Searching for a local file
#'
#' The path must be assigned before it is used: with the assignment commented
#' out, the bare name `pdf` resolves to the `grDevices::pdf()` function and
#' `pdf_info()` fails. The variable is named `pdf_file` so it does not shadow
#' that function.
pdf_file <- file.path("../local/", "Report remote sensing.pdf")
#' The local report may not exist on every machine, so its metadata is only
#' printed when the file is present; otherwise the step is skipped with a note.
if (file.exists(pdf_file)) {
  unlist(pdf_info(pdf_file))
} else {
  message("Local PDF not found, skipping pdf_info(): ", pdf_file)
}
#' Reading the pdf
#'
#' `pdf_text()` is given the URL of the book and its result is kept in `rs`,
#' which is indexed by page number below.
book_url <- "http://the-little-prince.site/appendix/The_Little_Prince.pdf"
rs <- pdf_text(book_url)
#' Compact overview of the extracted text
str(rs)
#' Print a few individual pages (50, 12 and 13, in that order)
for (page in c(50, 12, 13)) {
  print(rs[page])
}
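#' OCR alternative for scanned pages: `pdftools::pdf_ocr_text(<pdf file>, pages = 13)`.
#' Note that it takes the PDF file itself, not already-extracted text such as `rs[13]`.
#' It needs the 'tesseract' package for R, and Tesseract installed on your system. In Parabola GNU/Linux, use: `pacman -S tesseract`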