Extracting text from pdf
This commit is contained in:
parent
50e1dc9340
commit
46246663fd
|
@ -0,0 +1,52 @@
|
|||
#' ---
#' title: "Using pdf tools in R to explore files in pdf format"
#' date: "2022-06-24"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---

#' Run this script in a fresh R session rather than clearing the workspace
#' with `rm(list = ls())`, which would also wipe the objects of whoever
#' sources or renders it.

#' Libraries

library(pdftools)
library(data.table)

# Package help is only useful at the console, so it is skipped when the
# script is rendered non-interactively. Inside `if` the help objects are not
# auto-printed, hence the explicit print() calls.
if (interactive()) {
  print(help(package = "pdftools"))
  print(help("pdftools"))
}

#'------------------------------------------------------------------------------
#'
#' ## Data
#'
#' ------------------------------------------------------------------------------

#' Searching for a local file
#'
#' The path is stored in `pdf_path` (not `pdf`) so that it cannot be confused
#' with the `grDevices::pdf()` function when the assignment is missing.

pdf_path <- file.path("../local/", "Report remote sensing.pdf")

# The local report is not shipped with the script, so only query its metadata
# when the file is actually present.
if (file.exists(pdf_path)) {
  print(unlist(pdf_info(pdf_path)))
} else {
  message("Local PDF not found, skipping pdf_info(): ", pdf_path)
}

#' Reading the pdf

book_url <-
  "http://the-little-prince.site/appendix/The_Little_Prince.pdf"

# pdf_text() returns a character vector with one element per page.
rs <- pdf_text(book_url)

str(rs)

# Inspect the text of a few individual pages.
rs[50]

rs[12]

rs[13]

#' Optional OCR of page 13, for scanned pages without a text layer. Needs the
#' 'tesseract' package for R and tesseract installed on your system. In
#' Parabola GNU-linux, use: `pacman -S tesseract`. `pdf_ocr_text()` expects the
#' PDF itself plus `pages`, not text already extracted by `pdf_text()`:
#'
#' `pdftools::pdf_ocr_text(book_url, pages = 13)`
|
Loading…
Reference in New Issue