diff --git a/README.org b/README.org
index 599a8a6..d09e61e 100755
--- a/README.org
+++ b/README.org
@@ -28,6 +28,7 @@ data using the R programming environment
 
 ** Web scraping
 
   * [[./script/scraping2.R][Web scraping example]]
+  * [[./script/web_scraping.R][Using 'rvest' for scraping]]
 
 ** Interesting packages
diff --git a/script/web_scraping.R b/script/web_scraping.R
new file mode 100755
index 0000000..7740988
--- /dev/null
+++ b/script/web_scraping.R
@@ -0,0 +1,72 @@
+#' ---
+#' title: "Web scraping"
+#' date: "2022-06-17"
+#' author: "Jose https://ajuda.multifarm.top"
+#' output:
+#'   html_document:
+#'     code_folding: show
+#'     toc: yes
+#'     toc_float:
+#'       smooth_scroll: true
+#'     df_print: paged
+#'     highlight: zenburn
+#' ---
+
+#' Libraries
+
+library(rvest)
+
+#' Data
+
+url <- "https://www-remessaonline-com-br.cdn.ampproject.org/c/s/www.remessaonline.com.br/blog/agtech/amp/"
+
+#' ------------------------------------------------------------------------------
+#'
+#' ## Scraping a url
+#'
+#' ------------------------------------------------------------------------------
+
+### Headings
+
+# Parse the page once; every query below reuses this document.
+u <- read_html(url)
+
+class(u)
+
+str(u)
+
+u[[2]]
+
+html_elements(u, "h1")
+
+html_elements(u, "h2")
+
+html_elements(u, "h3") |> html_text2() |> data.table::data.table()
+
+html_elements(u, "h4")
+
+html_elements(u, "h5")
+
+html_children(u)
+
+# All heading levels at once, via a CSS selector group ("h1, h2, ..., h6").
+nodes <- paste0("h", 1:6)
+
+html_elements(u, paste(nodes, collapse = ", ")) |> html_text2()
+
+# Run `?html_elements` interactively for the selector documentation.
+
+### Paragraphs
+
+pel <- html_elements(u, "p") |> html_text()
+
+peli <- as.list(pel)
+
+peli[[2]]
+
+### List elements
+
+html_elements(u, "ul") |> html_text()
+
+html_elements(u, "li") |> html_text()
+