From 711047886352d8648f5cd35c86c3cb68865f9563 Mon Sep 17 00:00:00 2001 From: Jose Date: Fri, 15 Jul 2022 14:54:28 -0300 Subject: [PATCH] Updating script SQLite-including references --- .gitignore | 0 LICENSE | 0 README.org | 13 -- bib/apa-5th-edition.csl | 341 ++++++++++++++++++++++++++++++++++++++++ bib/references.bib | 20 +++ script/db.R | 158 ++++++++++++++----- 6 files changed, 481 insertions(+), 51 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.org create mode 100755 bib/apa-5th-edition.csl create mode 100644 bib/references.bib mode change 100644 => 100755 script/db.R diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.org b/README.org old mode 100644 new mode 100755 index 4d4efb4..d1abceb --- a/README.org +++ b/README.org @@ -3,19 +3,6 @@ * mort_geral -** TODO [#A] update code :noexport: -:LOGBOOK: -CLOCK: [2022-07-14 jue 08:12]--[2022-07-14 jue 08:22] => 0:10 -CLOCK: [2022-07-13 mié 16:13]--[2022-07-13 mié 16:43] => 0:30 -CLOCK: [2022-07-13 mié 08:44]--[2022-07-13 mié 09:14] => 0:30 -CLOCK: [2022-07-11 lun 15:21]--[2022-07-11 lun 15:25] => 0:04 -CLOCK: [2022-07-11 lun 14:09]--[2022-07-11 lun 14:39] => 0:30 -CLOCK: [2022-07-08 vie 14:56]--[2022-07-08 vie 15:26] => 0:30 -:END: - -Análise de indicadores de saúde: mortalidade geral da população brasileira. 
- - ** Scripts - [[./script/db.R][Usando RSQlite para salvar dados de maior tamanho: exemplo]] diff --git a/bib/apa-5th-edition.csl b/bib/apa-5th-edition.csl new file mode 100755 index 0000000..186557d --- /dev/null +++ b/bib/apa-5th-edition.csl @@ -0,0 +1,341 @@ + + diff --git a/bib/references.bib b/bib/references.bib new file mode 100644 index 0000000..977e7f3 --- /dev/null +++ b/bib/references.bib @@ -0,0 +1,20 @@ +@book{wiley2020advanced, +title = {Advanced R 4 Data Programming and the Cloud: Using PostgreSQL, AWS, and Shiny}, +author = {Matt Wiley and Joshua F. Wiley}, +year = {2020}, +doi = {10.1007/978-1-4842-5973-3}, +language = {English}, +isbn = {9781484259726}, +publisher = {Apress}, +address = {United States of America}, +edition = {2nd}, +} + +@Manual{R, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2022}, + url = {https://www.R-project.org/}, + } \ No newline at end of file diff --git a/script/db.R b/script/db.R old mode 100644 new mode 100755 index 8997013..c40cf43 --- a/script/db.R +++ b/script/db.R @@ -1,23 +1,21 @@ #' --- -#' title: "Create database fo mortality indicator Brazil population" +#' title: "Create a SQLite database for mortality data - Brazil population" #' author: "José A Bran - https://ayuda.onecluster.org/" #' date: "2021-04-22" #' output: #' html_document: -#' df_print: paged -#' toc: yes -#' toc_float: yes +#' code_folding: hide +#' toc: yes +#' toc_float: +#' smooth_scroll: true +#' highlight: zenburn +#' bibliography: ../bib/references.bib # References +#' csl: ../bib/apa-5th-edition.csl # Citation style language #' --- -#'+ r setup, include=FALSE +#+ setup, include=FALSE knitr::opts_chunk$set(echo = TRUE) -############################################################################ -## From: ## -## ## -## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ## 
-############################################################################ - rm(list = ls()) library(DBI) @@ -26,70 +24,140 @@ library(data.table) library(ggplot2) theme_set(theme_bw()) -#' ## How to deal with some big data for your machine memory ("data bigger than ram") +#'------------------------------------------------------------------------------ +#' +#' ## Dealing with large data +#' +#' ------------------------------------------------------------------------------ +#' +#' +#' R cannot handle data larger than RAM. Therefore, how "big" some data is, is +#' relative to the memory and processing capacity of the machines. +#' +#' These population mortality data are larger than the RAM memory of most common +#' personal computers (4 to 16 Gigabytes). +#' +#' Thus, it would be appropriate to look for smart strategies to deal with this data. + + +#' We may consider the following options: +#' +#' +#' - Download the data in csv format, and reduce the size of each file +#' - You may use 'rds' files in R for this purpose +#' - Check "?saveRDS" help for more information +#' - Work with pieces of data, extracting only columns or rows you are interested +#' in +#' - Explore a database solution +#' - There are multiple resources to work with SQL and NoSQL databases in R +#' - Take a look at the RPostgreSQL package for SQL integration +#' - Check the "mongolite" package for NoSQL integration +#' +#' Learning about SQLite databases can be an option to make a first approach to +#' basic concepts about databases and connections in R. 
#' -#' This database is large, thus to work with it in a local machine, a Sqlite database can be an option - - -#' ## How to create a new database +#' A great discussion about this topic can be found in this book +#' [@wiley2020advanced] #' -#' Check the function for more information: -#' > ?dbConnect +#' +#' And, take a look to this: + +?saveRDS + + +#'------------------------------------------------------------------------------ +#' +#' ## How to create a new SQLite database +#' +#' ------------------------------------------------------------------------------ + +############################################################################ +## From: ## +## ## +## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ## +############################################################################ + +#' Check the function help in R for more information: + +?dbConnect mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite") -#' Disconnect: +#' When you are done with the process of writing or extracting data, just +#' stop the connection with the database: +#' #' dbDisconnect(mortdb) - +#' #' unlink("mort_db.sqlite") -#' ## Hoe to include a table in the Sqlite database +?dbDisconnect + +?unlink + +#'------------------------------------------------------------------------------ +#' +#' ## Including tables in the SQLite database +#' +#'------------------------------------------------------------------------------ +#' +#' Download the data, then load the table to be written in the workspace and use #' the following to include it in the database. +#' +#' I saved the data as 'rds' to reduce the object weight. 
#' -#' Download the data, then load the table to be written in the workspace and use -#' the following to include it in the database - -d <- readRDS("../data/ETLSIM.DORES_2010.rds") # I saved the data as 'rds' to reduce the object weight -setDT(d) -setnames(d, tolower) -names(d) - #' You may also access the data from the cloud: #' #' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv' #' #' d <- fread(url) -#' setnames(d, tolower) + + +d <- readRDS("../data/ETLSIM.DORES_2010.rds") + +setDT(d) + +setnames(d, tolower) + +head(names(d)) + +#' ------------------------------------------------------------------------------ #' #' ## Update 2022: #' +#'------------------------------------------------------------------------------ +#' #' Please note that the data has been updated since the build of this script, thus #' some colum names and type differ between tables dbWriteTable(mortdb, "Mortalidade_Br_2010", d) - +#'------------------------------------------------------------------------------ +#' #' ## List the tables +#' +#' ------------------------------------------------------------------------------ dbListTables(mortdb) -#' ## Reading again as data.table: +#'------------------------------------------------------------------------------ +#' +#' ## Reading again as data.table +#' +#' ------------------------------------------------------------------------------ dt = dbReadTable(mortdb, "Mortalidade_Br_2010") setDT(dt) setnames(dt, tolower) -names(dt) +head(names(dt)) -#' ## Disconnect -#' +#' Disconnect and remove duplicated information dbDisconnect(mortdb) -rm(d) +rm(mortdb, d) #' Then, you can select the columns or lines you want to use -#' dt = dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)] @@ -97,7 +165,11 @@ str(dt) dt[, .N, .(idade_obito_anos, def_sexo)] -#' ## Recoding dates: not reading date as date +#'------------------------------------------------------------------------------ +#' +#' ## Recoding dates from integers to 
"IDate" +#' +#' ------------------------------------------------------------------------------ class(dt$dtobito) @@ -113,7 +185,11 @@ dt[, .N, .(year(idtnasc))] dt[, .N, (age)] +#'------------------------------------------------------------------------------ +#' #' ## Visualizing data distribution +#' +#' ------------------------------------------------------------------------------ ggplot(dt, aes(age, fill = def_sexo)) + geom_histogram(bins = 200) + @@ -122,3 +198,9 @@ ggplot(dt, aes(age, fill = def_sexo)) + #' The end + +#'------------------------------------------------------------------------------ +#' +#' ## References +#' +#' ------------------------------------------------------------------------------