#' ---
#' title: "Web scraping"
#' date: "2022-06-19"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---

# Purpose: exploratory walkthrough that (1) sends an HTTP request with crul
# and inspects the response object, and (2) parses the same page with rvest
# to list headings, paragraphs and element attributes.
#
# The former `rm(list = ls())` call was dropped on purpose: it deletes the
# caller's objects without resetting packages or options. Restart R instead
# when a clean session is needed.

#' Libraries
library(rvest)
library(crul)

# Help pages and vignettes are only opened in an interactive session.
# print() is required because values inside `{}` are not auto-printed.
if (interactive()) {
  print(help(package = "crul"))
  print(vignette(package = "crul"))
  print(vignette("crul", package = "crul"))
}

#' https://ropensci.org
#'
#' Testing crul
#'

# Build the HTTP client; the outer parentheses print it after assignment.
(x <- HttpClient$new(
  url = "https://dairymgt.info/tools.php",
  opts = list(
    timeout = 1
  ),
  headers = list(
    a = "hello world"
  )
))
class(x)
str(x)

# Send the GET request once, keep the response and print it. The original
# issued the same request twice (once to print, once to assign).
(res <- x$get())
str(res)

## The response from an HTTP request is another R6 class, HttpResponse, which
## has slots for the outputs of the request, and some functions to deal with
## the response:

## Status code
res$status_code
head(res$content)
res$method
res$request
res$opts

#'------------------------------------------------------------------------------
#'
#' Data
#'
#' ------------------------------------------------------------------------------

# `url` and `a` keep their original names because later code may use them.
url <- "https://dairymgt.info/tools.php"
a <- read_html(url)

html_children(a)[1] |>
  html_text()
str(a)

html_elements(a, "h1")
html_elements(a, "h2")
html_elements(a, "h2") |>
  html_text()
# Left disabled, as it appears in the original source:
# html_elements(a, "div") |> html_text2()
html_elements(a, "p") |>
  html_text()
html_elements(a, "h3")
html_elements(a, "h4")

#'------------------------------------------------------------------------------
#'
#' ## Attributes
#'
#' ------------------------------------------------------------------------------

html_attrs(html_elements(a, "h2"))

# NOTE(review): `href` is normally an attribute of <a>, not <h2>, and
# html_attr() gives NA where the attribute is absent -- confirm whether
# links ("a") were intended here.
a |>
  html_elements("h2") |>
  html_attr("href")

a |>
  html_elements("h2") |>
  html_attr("class")

html_attrs(html_elements(a, "h3"))

# Interactive-only replacement for `?html_attrs`.
if (interactive()) {
  print(help("html_attrs"))
}

###############################################################
## page %>%                                                  ##
##   html_nodes("a") %>%       # find all links              ##
##   html_attr("href") %>%     # get the url                 ##
##   str_subset("\\.xlsx") %>% # find those that end in xlsx ##
##   .[[1]]                                                  ##
###############################################################