Web scraping
This commit is contained in:
parent
8f09dc9ad1
commit
68fbc33a3c
|
@ -24,3 +24,4 @@ data using the R programming environment
|
|||
** Interesting packages
|
||||
|
||||
* [[./script/roadoi_package.R]['roadoi']]
|
||||
* [[./script/scraping.R]['crul' package - web scraping]]
|
||||
|
|
|
@ -0,0 +1,122 @@
|
|||
#' ---
|
||||
#' title: "Web scraping"
|
||||
#' date: "2022-06-19"
|
||||
#' author: "Jose https://ajuda.multifarm.top"
|
||||
#' output:
#'   html_document:
#'     code_folding: show
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     df_print: paged
#'     highlight: zenburn
|
||||
#' ---
|
||||
|
||||
#' Workspace
#'
#' Run this script in a fresh R session to start from a clean state.

# `rm(list = ls())` was dropped on purpose: it only removes the visible objects
# of the calling environment (attached packages and options are untouched), and
# it silently wipes the caller's objects whenever this script is sourced.
|
||||
|
||||
#' Libraries
|
||||
library(rvest)
|
||||
library(crul)
|
||||
|
||||
# Documentation lookups for the crul package. They are only meaningful at the
# console, so they are skipped when the script is rendered or run in batch.
# Inside a braced block values are not auto-printed, hence the explicit print().
if (interactive()) {
  print(help(package = "crul"))               # package index
  print(vignette(package = "crul"))           # list of available vignettes
  print(vignette("crul", package = "crul"))   # open the introductory vignette
}
|
||||
|
||||
#' https://ropensci.org
|
||||
#'
|
||||
#' Testing crul
|
||||
#'
|
||||
|
||||
# Build a crul HTTP client for the target page. `opts` is forwarded as curl
# options and `headers` as request headers; the header below is a dummy value
# used only to try out the interface.
x <- HttpClient$new(
  url = "https://dairymgt.info/tools.php",
  opts = list(timeout = 1),
  headers = list(a = "hello world")
)
x
|
||||
|
||||
# Inspect the client object.
class(x)
str(x)

# Issue the GET request a single time: store the response, then print it,
# rather than requesting the page once for display and again for assignment.
res <- x$get()
res

# Structure of the response object.
str(res)
|
||||
|
||||
## The response from an HTTP request is another R6 class, HttpResponse, which has
## fields for the outputs of the request, and some methods to deal with the response:
|
||||
|
||||
## Status code
|
||||
|
||||
# Status code of the response.
res$status_code

# First elements of the response body.
res$content |> head()

# Request-side details kept on the response: method, request object, options.
res$method
res$request
res$opts
|
||||
|
||||
|
||||
#'------------------------------------------------------------------------------
|
||||
#'
|
||||
#' Data
|
||||
#'
|
||||
#' ------------------------------------------------------------------------------
|
||||
|
||||
# Address of the page to scrape.
url <- "https://dairymgt.info/tools.php"

# Download and parse the page; `a` is the parsed document used by all the
# rvest queries that follow.
a <- read_html(url)
|
||||
|
||||
# Text of the first child node of the parsed document.
html_text(html_children(a)[1])

# Internal structure of the parsed document object.
str(a)

# Level-1 and level-2 headings as node sets.
a |> html_elements("h1")
a |> html_elements("h2")

# Text content of the level-2 headings.
a |> html_elements("h2") |> html_text()

#' html_elements(a, "div") |> html_text2()

# Text content of the paragraphs.
a |> html_elements("p") |> html_text()

# Level-3 and level-4 headings as node sets.
a |> html_elements("h3")
a |> html_elements("h4")
|
||||
|
||||
#'------------------------------------------------------------------------------
|
||||
#'
|
||||
#' ## Attributes
|
||||
#'
|
||||
#' ------------------------------------------------------------------------------
|
||||
|
||||
# All attributes of every <h2> element.
html_attrs(html_elements(a, "h2"))

# One attribute at a time for the <h2> elements.
# NOTE(review): headings normally carry no `href`, so the first query is
# expected to return only NA -- confirm whether <a> elements were intended.
a |> html_elements("h2") |> html_attr("href")
a |> html_elements("h2") |> html_attr("class")

# All attributes of every <h3> element.
html_attrs(html_elements(a, "h3"))

# Help page lookup; skipped when the script is rendered or run in batch.
if (interactive()) {
  print(help("html_attrs"))
}
|
||||
|
||||
###############################################################
|
||||
## page %>% ##
|
||||
## html_nodes("a") %>% # find all links ##
|
||||
## html_attr("href") %>% # get the url ##
|
||||
## str_subset("\\.xlsx$") %>% # find those that end in xlsx ##
|
||||
## .[[1]] ##
|
||||
###############################################################
|
Loading…
Reference in New Issue