From 68fbc33a3c7fcf0ae9422943a42972a787149520 Mon Sep 17 00:00:00 2001
From: Jose
Date: Thu, 27 Oct 2022 12:23:16 -0300
Subject: [PATCH] Web scraping

---
 README.org        |   1 +
 script/scraping.R | 121 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100755 script/scraping.R

diff --git a/README.org b/README.org
index 1d2359e..8363d39 100755
--- a/README.org
+++ b/README.org
@@ -24,3 +24,4 @@ data using the R programming environment
 ** Interesting packages
 
 * [[./script/roadoi_package.R]['roadoi']]
+* [[./script/scraping.R]['crul' package - web scraping]]
diff --git a/script/scraping.R b/script/scraping.R
new file mode 100755
index 0000000..bc3c1c7
--- /dev/null
+++ b/script/scraping.R
@@ -0,0 +1,121 @@
+#' ---
+#' title: "Web scraping"
+#' date: "2022-06-19"
+#' author: "Jose https://ajuda.multifarm.top"
+#' output:
+#'   html_document:
+#'     code_folding: show
+#'     toc: yes
+#'     toc_float:
+#'       smooth_scroll: true
+#'     df_print: paged
+#'     highlight: zenburn
+#' ---
+
+#' Remove objects from the workspace
+
+rm(list = ls())
+
+#' Libraries
+library(rvest)
+library(crul)
+
+help(package = 'crul')
+
+vignette(package = 'crul')
+
+vignette('crul', package = 'crul')
+
+#' https://ropensci.org
+#'
+#' Testing crul
+#'
+
+(x <- HttpClient$new(
+  url = 'https://dairymgt.info/tools.php',
+  opts = list(
+    timeout = 1
+  ),
+  headers = list(
+    a = "hello world"
+  )
+))
+
+class(x)
+
+str(x)
+
+## Perform the GET request and store the HttpResponse object
+res <- x$get()
+
+str(res)
+
+## The response from an HTTP request is another R6 class, HttpResponse, which has
+## slots for the outputs of the request and functions to deal with the response:
+
+## Slots on the response object
+
+res$status_code   # HTTP status code
+
+head(res$content) # raw body (bytes)
+
+res$method        # HTTP method used
+
+res$request       # the request that produced this response
+
+res$opts          # curl options passed with the request
+
+
+#' ------------------------------------------------------------------------------
+#'
+#' ## Data
+#'
+#' ------------------------------------------------------------------------------
+
+url <- 'https://dairymgt.info/tools.php'
+
+a <- read_html(url)
+
+html_children(a)[1] |> html_text()
+
+str(a)
+
+html_elements(a, "h1")
+
+html_elements(a, "h2")
+
+html_elements(a, "h2") |> html_text()
+
+#' html_elements(a, "div") |> html_text2()
+
+html_elements(a, "p") |> html_text()
+
+html_elements(a, "h3")
+
+html_elements(a, "h4")
+
+#' ------------------------------------------------------------------------------
+#'
+#' ## Attributes
+#'
+#' ------------------------------------------------------------------------------
+
+html_attrs(html_elements(a, 'h2'))
+
+a |> html_elements("h2") |> html_attr("href")  # h2 tags have no href, so this is NA
+
+a |> html_elements("h2") |> html_attr("class")
+
+html_attrs(html_elements(a, 'h3'))
+
+?html_attrs
+
+###############################################################
+## Sketch: pull the first .xlsx link from a page             ##
+## (needs magrittr's %>% pipe and stringr's str_subset())    ##
+## page %>%                                                  ##
+##   html_elements("a") %>%    # find all links              ##
+##   html_attr("href") %>%     # get the urls                ##
+##   str_subset("\\.xlsx") %>% # keep the .xlsx links        ##
+##   .[[1]]                    # take the first one          ##
+###############################################################
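
A minimal follow-up sketch of what the script stops short of: decoding the response body. It assumes the same https://dairymgt.info/tools.php endpoint is still reachable and relies on the parse() and success() methods that crul documents on its HttpResponse class.

library(crul)

## Build a client and perform the GET request (same endpoint as in the script)
cli <- HttpClient$new(url = 'https://dairymgt.info/tools.php')
res <- cli$get()

res$success()                # TRUE for a successful status code
html <- res$parse("UTF-8")   # decode the raw content slot into a character string
substr(html, 1, 200)         # peek at the start of the returned HTML

The decoded string can then be handed to rvest::read_html() for the element-level exploration shown in the Data section of the script.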
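
The boxed comment at the end of scraping.R sketches how to grab the first .xlsx link on a page. A runnable version of that idea, assuming a hypothetical listing URL and that the stringr package is installed:

library(rvest)
library(stringr)

## Hypothetical page that links to .xlsx files -- swap in a real URL
page <- read_html('https://example.com/reports')

hrefs <- page |> html_elements("a") |> html_attr("href")  # all link targets
xlsx  <- str_subset(hrefs, "\\.xlsx$")                    # keep links ending in .xlsx
xlsx[1]                                                   # the first match (NA if none)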