Web scraping script

This commit is contained in:
Jose 2022-10-27 12:59:46 -03:00
parent 76873af5a9
commit e86c8110d7
2 changed files with 72 additions and 0 deletions

View File

@ -28,6 +28,7 @@ data using the R programming environment
** Web scraping ** Web scraping
* [[./script/scraping2.R][Web scraping example]] * [[./script/scraping2.R][Web scraping example]]
* [[./script/web_scraping.R][Using 'rvest' for scraping]]
** Interesting packages ** Interesting packages

71
script/web_scraping.R Executable file
View File

@ -0,0 +1,71 @@
#' ---
#' title: "Web scraping"
#' date: "2022-06-17"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---
#' Libraries
# The workspace is deliberately not cleared with rm(list = ls()): when this
# script is sourced or rendered inside an existing session that call would
# delete the caller's objects, and nothing below relies on an empty workspace.
library(rvest)
#' Data
# Address of the page to scrape (the AMP-cache URL of a blog post).
url <- "https://www-remessaonline-com-br.cdn.ampproject.org/c/s/www.remessaonline.com.br/blog/agtech/amp/"
#'------------------------------------------------------------------------------
#'
#' ## Scraping a url
#'
#' ------------------------------------------------------------------------------
### Headings
# Download and parse the page; `u` is the parsed document queried below.
u <- read_html(url)
# Inspect the parsed object: its class, its structure and its second component.
class(u)
str(u)
u[[2]]
# Elements matching each heading tag, one query per level.
html_elements(u, "h1")
html_elements(u, "h2")
# Text of the <h3> elements collected into a data.table.
# html_elements() replaces the superseded html_nodes() so the whole script
# uses a single selector function.
html_elements(u, "h3") |>
  html_text2() |>
  data.table::data.table()
html_elements(u, "h4")
html_elements(u, "h5")
# Children of the document node.
html_children(u)
# Heading tag names "h1" ... "h6".
nodes <- paste0("h", 1:6)
# For details on the selector arguments run ?html_elements interactively;
# the help call is not executed here so the script runs unattended.
### Paragraphs
# Text of every <p> element: `pel` holds it as a character vector and
# `peli` holds the same values as a list, one entry per paragraph.
pel <- html_text(html_elements(u, "p"))
peli <- as.list(pel)
# Show the second paragraph.
peli[[2]]
### List elements
# Text of the unordered lists (<ul>) and of the individual items (<li>).
# html_elements() is used instead of the superseded html_nodes(), matching
# the selector function used for the headings and paragraphs.
html_elements(u, "ul") |> html_text()
html_elements(u, "li") |> html_text()