Web scraping

This commit is contained in:
Jose 2022-10-27 12:23:16 -03:00
parent 8f09dc9ad1
commit 68fbc33a3c
2 changed files with 123 additions and 0 deletions

View File

@ -24,3 +24,4 @@ data using the R programming environment
** Interesting packages
* [[./script/roadoi_package.R]['roadoi']]
* [[./script/scraping.R]['crul' package - web scraping]]

122
script/scraping.R Executable file
View File

@ -0,0 +1,122 @@
#' ---
#' title: "Web scraping"
#' date: "2022-06-19"
#' author: "Jose https://ajuda.multifarm.top"
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
#' ---
#' Libraries
# The global environment is intentionally NOT wiped with `rm(list = ls())`:
# sourcing this script must never destroy objects the caller still needs.
# Run it in a fresh R session when a clean workspace is wanted.
library(rvest) # HTML parsing and element/attribute extraction
library(crul)  # HTTP client (HttpClient)
# Browsing package documentation opens a pager or viewer, which is only
# useful in an interactive session; skip it when the script is rendered or
# run in batch mode. Values inside `{}` are not auto-printed, hence print().
if (interactive()) {
  print(help(package = "crul"))
  print(vignette(package = "crul"))
  print(vignette("crul", package = "crul"))
}
#' https://ropensci.org
#'
#' ## Testing crul
#'
# Build an HTTP client for the target page; the outer parentheses print the
# client object after assignment.
(x <- HttpClient$new(
  url = "https://dairymgt.info/tools.php",
  opts = list(
    timeout = 1 # curl `timeout` option, in seconds
  ),
  headers = list(
    a = "hello world" # sent as a custom request header named `a`
  )
))
class(x)
str(x)
# Issue the GET request exactly once: store the response and print it in the
# same step, instead of hitting the server a second time just to display it.
(res <- x$get())
str(res)
## The response from an HTTP request is another R6 class, HttpResponse, which
## has slots for the outputs of the request and some functions to deal with
## the response.
## Status code
res$status_code
## First elements of the response body
head(res$content)
## HTTP method, request details and the curl options that were used
res$method
res$request
res$opts
#'------------------------------------------------------------------------------
#'
#' ## Data
#'
#' ------------------------------------------------------------------------------
# Download and parse the page with rvest. The parsed document is kept in `a`
# because the attribute examples later in the script query it again.
url <- "https://dairymgt.info/tools.php"
a <- read_html(url)
# Text of the first child node of the parsed document
html_children(a)[1] |> html_text()
str(a)
# Heading elements, first as nodes and then as plain text
html_elements(a, "h1")
html_elements(a, "h2")
html_elements(a, "h2") |> html_text()
# Disabled on purpose; kept as a plain comment so it is not rendered as prose:
# html_elements(a, "div") |> html_text2()
html_elements(a, "p") |> html_text()
html_elements(a, "h3")
html_elements(a, "h4")
#'------------------------------------------------------------------------------
#'
#' ## Attributes
#'
#' ------------------------------------------------------------------------------
# Every attribute of each <h2> element
html_attrs(html_elements(a, "h2"))
# `href` is an attribute of anchor elements, not of headings (html_attr()
# returns NA where the attribute is absent), so read it from the links.
a |> html_elements("a") |> html_attr("href")
# A single named attribute of the headings
a |> html_elements("h2") |> html_attr("class")
html_attrs(html_elements(a, "h3"))
# The help page is only worth opening in an interactive session
if (interactive()) print(help("html_attrs"))
###############################################################
## page %>%                                                  ##
##   html_nodes("a") %>%       # find all links              ##
##   html_attr("href") %>%     # get the url                 ##
##   str_subset("\\.xlsx") %>% # find those that end in xlsx ##
##   .[[1]]                                                  ##
###############################################################