diff --git a/README.org b/README.org index 9e7b1dd..299a7dd 100755 --- a/README.org +++ b/README.org @@ -40,3 +40,9 @@ data using the R programming environment * [[./script/roadoi_package.R]['roadoi']] * [[./script/scraping.R]['crul' package - web scraping]] + +** Data wrangling with {data.table} + + * [[./doc/datatable_intro.org][Introduction to {data.table}]] + * [[./doc/datatable_querying.org][Querying in {data.table}: filter in ~i~ ]] + * [[./doc/datatable_querying_j.org][Querying in {data.table}: select in ~j~ ]] diff --git a/doc/datatable_intro.org b/doc/datatable_intro.org new file mode 100644 index 0000000..52448c2 --- /dev/null +++ b/doc/datatable_intro.org @@ -0,0 +1,90 @@ +#+TITLE: Datatable package +#+DATE: 2021-10-26 +#+OPTIONS: creator:nil timestamp:nil todo:nil num:nil +#+PROPERTY: header-args:R :results output :session *Rc* :cmdline :tangle yes +#+PROPERTY: header-args:R+ :exports both +#+SETUPFILE: https://fniessen.github.io/org-html-themes/org/theme-readtheorg.setup +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: + + +#+begin_export html +
+

+ Org document with + R code +

+

+ + + +

+
+#+end_export + +#+begin_src R :exports code + rm(list = ls()) +#+end_src + +** Introduction + +#+begin_src R + library(data.table) +#+end_src + + +Reading data into R using ~{data.table}~ + +#+begin_src R :exports code + url = "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv" + flights <- fread(url) +#+end_src + +#+RESULTS: + +#+begin_src R + names(flights) +#+end_src + +#+begin_src R + flights +#+end_src + +#+begin_src R + nrow(flights) +#+end_src + +#+RESULTS: +: [1] 253316 + +Creating a data.table: + +#+begin_src R + dt <- data.table(col1 = c(1:6), + col2 = LETTERS[1:6], + col3 = (1)) +#+end_src + + +#+begin_src R + class(dt) +#+end_src + +#+begin_src R + class(dt$col2) +#+end_src + +** Some references to learn more about {data.table} + + * https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html + * See vignettes in R: + : > vignette(package="data.table") + * https://atrebas.github.io/post/2019-03-03-datatable-dplyr/ + * https://rdatatable.gitlab.io/data.table/ + * https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf + * https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly + * ~dtplyr~ package: + * https://github.com/tidyverse/dtplyr diff --git a/doc/datatable_querying.org b/doc/datatable_querying.org new file mode 100644 index 0000000..008d366 --- /dev/null +++ b/doc/datatable_querying.org @@ -0,0 +1,67 @@ +#+TITLE: Datatable package +#+DATE: 2021-10-26 +#+OPTIONS: creator:nil timestamp:nil todo:nil num:nil +#+PROPERTY: header-args:R :results output :session *Rc* :cmdline :tangle yes +#+PROPERTY: header-args:R+ :exports both +#+SETUPFILE: https://fniessen.github.io/org-html-themes/org/theme-readtheorg.setup +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: + + +#+begin_export html +
+

+ Org document with + R code +

+

+ + + +

+
+#+end_export + +#+begin_src R :exports code + rm(list = ls()) +#+end_src + + +** Querying data.table + +Syntax: + + * ~DT[i, j, by]~ + +Subsetting by "i" and "j" + +** Subset rows in "i" + +#+begin_src R + dt1 <- flights[origin == "JFK" & month == 6L] + dt1 +#+end_src + +Get the first two rows + +#+begin_src R + flights[1:2] +#+end_src + +#+RESULTS: +: year month day dep_delay arr_delay carrier origin dest air_time distance +: 1: 2014 1 1 14 13 AA JFK LAX 359 2475 +: 2: 2014 1 1 -3 13 AA JFK LAX 363 2475 +: hour +: 1: 9 +: 2: 11 + +Ordering results of querying + +#+begin_src R + dt_filt <- flights[order(origin, -dest)] + dt_filt +#+end_src diff --git a/doc/datatable_querying_j.org b/doc/datatable_querying_j.org new file mode 100644 index 0000000..b5c1990 --- /dev/null +++ b/doc/datatable_querying_j.org @@ -0,0 +1,108 @@ +#+TITLE: Datatable package +#+DATE: 2021-10-26 +#+OPTIONS: creator:nil timestamp:nil todo:nil num:nil +#+PROPERTY: header-args:R :results output :session *Rc* :cmdline :tangle yes +#+PROPERTY: header-args:R+ :exports both +#+SETUPFILE: https://fniessen.github.io/org-html-themes/org/theme-readtheorg.setup +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: +#+HTML_HEAD: + + +#+begin_export html +
+

+ Org document with + R code +

+

+ + + +

+
+#+end_export + +#+begin_src R :exports code + rm(list = ls()) +#+end_src + + +** Select columns in "j" + +This creates a vector + +#+begin_src R + dt_c <- flights[, arr_delay] + head(dt_c) +#+end_src + +To create a data.table, use the function "~list~" or dot "~.~" + +#+begin_src R + dt_c1 <- flights[, list(arr_delay)] + head(dt_c1) +#+end_src + +The selection is wrapped in "~list~", thus, the columns can be +renamed + +#+begin_src R + dt_c2 <- flights[, .(delay2arriv = arr_delay, dist = distance)] + head(dt_c2) +#+end_src + +#+begin_src R + dt_c2a <- flights[, c("arr_delay", "distance")] + head(dt_c2a) +#+end_src + + +#+begin_src R + dt_c2b <- flights[, year:day] + head(dt_c2b) +#+end_src + +Building new columns + +#+begin_src R + head(dt_c2b[, `:=`(time = year -1, # name = value, + pumpkin = month, + tomatoe = day +1)]) +#+end_src + +Renaming columns + +#+begin_src R + setnames(dt_c2b, + c("time", "pumpkin", "tomatoe"),# old names + c("time_minus1", "pumpkin_month", "day_plus1")) # new names + + names(dt_c2b) +#+end_src + +#+begin_src R + setnames(dt_c2b, toupper) # new names + names(dt_c2b) +#+end_src + +Renaming columns + +#+begin_src R + setnames(dt_c2b, tolower) # new names + names(dt_c2b) +#+end_src + +#+begin_src R + dt_c2c <- flights[, -(year:day)] + head(dt_c2c) +#+end_src + +Select columns by patterns, e.g., selecting 5 rows from columns having "del" +characters in the name: + +#+begin_src R + flights[1:5, names(flights) %like% "del", with = FALSE] +#+end_src