53 lines
1019 B
R
Executable File
53 lines
1019 B
R
Executable File
#' ---
#' title: "Using pdf tools in R to explore files in pdf format"
#' date: "2022-06-24"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---
|
|
|
|
#' Setup
#'
#' Run this script in a fresh R session (or render it) instead of clearing the
#' workspace with `rm(list = ls())`: that call deletes the caller's objects
#' when the script is sourced, yet leaves attached packages and options as
#' they were, so it does not give a clean state anyway.

#' Libraries

library(pdftools)
library(data.table)

#' Package documentation. Help pages are only useful in an interactive
#' session, so they are skipped when the script runs non-interactively.
#' `print()` is explicit because values are not auto-printed inside `{ }`.

if (interactive()) {
  print(help(package = "pdftools"))
  print(help("pdftools"))
}
|
|
|
|
#' ------------------------------------------------------------------------------
#'
#' ## Data
#'
#' ------------------------------------------------------------------------------

#' Searching for a local file
#'
#' The path must be assigned before it is used: with the assignment commented
#' out, the bare name `pdf` resolves to the `grDevices::pdf()` function and
#' `pdf_info()` fails. A distinct name (`pdf_path`) avoids that clash.

pdf_path <- file.path("..", "local", "Report remote sensing.pdf")

#' Show the document metadata as a flat named vector. The file is local to
#' the author's machine, so skip this step with a note when it is absent.

if (file.exists(pdf_path)) {
  print(unlist(pdf_info(pdf_path)))
} else {
  message("Local PDF not found, skipping pdf_info(): ", pdf_path)
}
|
|
|
|
#' Reading the pdf
#'
#' `pdf_text()` extracts the text of the document; the result is indexed by
#' page below.

pdf_url <- "http://the-little-prince.site/appendix/The_Little_Prince.pdf"
rs <- pdf_text(pdf_url)

#' Structure of the extracted text

str(rs)

#' A few individual pages

rs[50]

rs[12]

rs[13]

#' OCR of a page needs the 'tesseract' package for R and tesseract installed
#' on your system. In Parabola GNU-linux, use: `pacman -S tesseract`.
#' `pdf_ocr_text()` expects the PDF itself (path, URL or raw vector) plus the
#' page numbers, not text already extracted by `pdf_text()`:
#'
#' `pdftools::pdf_ocr_text(pdf_url, pages = 13)`
|