#' ---
#' title: "Using pdf tools in R to explore files in pdf format"
#' date: "2022-06-24"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---

# NOTE: the workspace is deliberately not cleared here. `rm(list = ls())`
# deletes the caller's objects when this script is sourced and does not reset
# loaded packages or options; run the script in a fresh R session instead.

#' Libraries
library(pdftools)
library(data.table)

# The package index and help page are only useful during interactive
# exploration, so they are skipped when the script is run non-interactively.
if (interactive()) {
  help(package = "pdftools")
  ?pdftools
}

#' ------------------------------------------------------------------------------
#'
#' ## Data
#'
#' ------------------------------------------------------------------------------

#' Searching for a local file
#'
#' The report lives outside the project tree, so the metadata is only shown
#' when the file is actually present on this machine.

# Path components are passed separately so `file.path()` does not emit a
# doubled separator ("../local//...").
pdf <- file.path("..", "local", "Report remote sensing.pdf")
if (file.exists(pdf)) {
  unlist(pdf_info(pdf))
} else {
  message("Local file not found, skipping pdf_info(): ", pdf)
}

#' Reading the pdf
#'
#' `pdf_text()` returns a character vector with one element per page.

rs <- pdf_text("http://the-little-prince.site/appendix/The_Little_Prince.pdf")
str(rs)

# Print a few individual pages; an index past the last page yields NA.
rs[50]
rs[12]
rs[13]

#' Pages without a text layer can be read with OCR instead, e.g.
#' `pdftools::pdf_ocr_text(<pdf file>, pages = 13)`. Note that it takes the PDF
#' file itself, not the text returned by `pdf_text()`.
#'
#' OCR needs the 'tesseract' R package and the Tesseract engine installed on
#' your system. On Parabola GNU/Linux, use: `pacman -S tesseract`