72 lines
1.1 KiB
R
Executable File
72 lines
1.1 KiB
R
Executable File
#' ---
#' title: "Web scraping"
#' date: "2022-06-17"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---
|
|
|
|
#' Libraries

# The workspace is intentionally NOT cleared with `rm(list = ls())`.
# That call deletes the caller's objects whenever this script is sourced,
# yet it does not produce a clean session: attached packages, options and
# hidden (dot-prefixed) objects all survive it. Restart R, or render the
# script in a fresh session, when a clean state is needed.

library(rvest)
|
|
|
|
#' Data

# Address of the page to scrape (served from the cdn.ampproject.org host).
url <- "https://www-remessaonline-com-br.cdn.ampproject.org/c/s/www.remessaonline.com.br/blog/agtech/amp/"
|
|
|
|
#' ------------------------------------------------------------------------------
|
|
#'
|
|
#' ## Scraping a url
|
|
#'
|
|
#' ------------------------------------------------------------------------------
|
|
|
|
### Headings

# Download and parse the page once; every query below reuses this document.
u <- read_html(url)

# Quick look at the parsed object: its class vector, its internal
# structure, and the second component returned by `[[`.
class(u)
str(u)
u[[2]]

# Query every heading level (h1 through h6) in one pass. The selectors are
# built once in `nodes` and reused, so no level is skipped; the result is a
# named list holding one node set per heading tag.
nodes <- paste0("h", 1:6)
headings <- lapply(nodes, \(tag) html_elements(u, tag))
names(headings) <- nodes
headings

# Level-3 headings as plain text, wrapped in a data.table for display.
# `html_elements()` is used here in place of the superseded `html_nodes()`.
headings[["h3"]] |>
  html_text2() |>
  data.table::data.table()

# Child elements of the parsed document.
html_children(u)

# Open the help page only in an interactive session, so that rendering or
# batch runs of this script are not interrupted by it.
if (interactive()) {
  help("html_elements")
}
|
|
|
|
### Paragraphs

# Text of every <p> element found in the parsed page.
pel <- html_text(html_elements(u, "p"))

# The same texts as a list, one element per paragraph.
peli <- as.list(pel)

# Show the second paragraph.
peli[[2]]
|
|
|
|
### List elements

# Text of each unordered list (<ul>) taken as a whole, followed by the text
# of each individual list item (<li>). `html_elements()` replaces
# `html_nodes()`, which rvest has superseded, so these calls match the
# heading and paragraph queries elsewhere in this script.
html_elements(u, "ul") |> html_text()
html_elements(u, "li") |> html_text()
|
|
|