bib/references.bib Normal file
View file

@ -0,0 +1,20 @@
title = {Advanced R 4 Data Programming and the Cloud: Using PostgreSQL, AWS, and Shiny},
author = {Matt Wiley and Joshua F. Wiley},
year = {2020},
doi = {10.1007/978-1-4842-5973-3},
language = {English},
isbn = {9781484259726},
publisher = {Apress},
address = {United States of America},
edition = {2nd},
title = {R: A Language and Environment for Statistical Computing},
author = {{R Core Team}},
organization = {R Foundation for Statistical Computing},
address = {Vienna, Austria},
year = {2022},
url = {https://www.R-project.org/},

script/db.R Normal file → Executable file
View file

@ -1,23 +1,21 @@
#' ---
#' title: "Create database for mortality indicator Brazil population"
#' title: "Create a SQLite database for mortality data - Brazil population"
#' author: "José A Bran - https://ayuda.onecluster.org/"
#' date: "2021-04-22"
#' output:
#' html_document:
#' df_print: paged
#' toc: yes
#' toc_float: yes
#' code_folding: hide
#' toc: yes
#' toc_float:
#' smooth_scroll: true
#' highlight: zenburn
#' bibliography: ../bib/references.bib # References
#' csl: ../bib/apa-5th-edition.csl # Citation style language
#' ---
#'+ r setup, include=FALSE
#+ setup, include=FALSE
# Show the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
## From: ##
## ##
## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
# NOTE: the former `rm(list = ls())` call was dropped on purpose. It deletes
# every object in the caller's workspace when the script is sourced or
# rendered, yet it does not reset loaded packages or options, so it does not
# give a clean session. Run the script in a fresh R session instead.
@ -26,70 +24,140 @@ library(data.table)
#' ## How to deal with some big data for your machine memory ("data bigger than ram")
#' ## Dealing with large data
#' ------------------------------------------------------------------------------
#' R cannot handle data larger than RAM. Therefore, how "big" some data is, is
#' relative to the memory and processing capacity of the machines.
#' These population mortality data are larger than the RAM memory of most common
#' personal computers (4 to 16 Gigabytes).
#' Thus, it would be appropriate to look for smart strategies to deal with this data.
#' We may consider the following options:
#' - Download the data in csv format, and reduce the size of each file
#' - You may use 'rds' files in R for this purpose
#' - Check "?saveRDS" help for more information
#' - Work with pieces of data, extracting only columns or rows you are interested
#' in
#' - Explore a database solution
#' - There are multiple resources to work with SQL and NoSQL databases in R
#' - Take a look at the RPostgreSQL package for SQL integration
#' - Check the "mongolite" package for NoSQL integration
#' Learning about SQLite databases can be a good first approach to the
#' basic concepts of databases and connections in R.
#' This database is large, so to work with it on a local machine, a SQLite database can be an option.
#' ## How to create a new database
#' A great discussion about this topic can be found in this book
#' [@wiley2020advanced]
#' Check the function for more information:
#' > ?dbConnect
#' And take a look at this:
#' ## How to create a new SQLite database
#' ------------------------------------------------------------------------------
## From: ##
## ##
## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
#' Check the function help in R for more information:
# Open a connection to the SQLite file "mort_db.sqlite" (relative to the
# working directory) and keep the handle for the write/read steps below.
mortdb <- dbConnect(drv = RSQLite::SQLite(), dbname = "mort_db.sqlite")
#' Disconnect:
#' When you are done with the process of writing or extracting data, just
#' stop the connection with the database:
#' dbDisconnect(mortdb)
#' unlink("mort_db.sqlite")
#' ## How to include a table in the SQLite database
#' ## Including tables in the SQLite database
#' Download the data, then load the table to be written in the workspace and use
#' the following to include it in the database.
#' I saved the data as 'rds' to reduce the object weight.
#' Download the data, then load the table to be written in the workspace and use
#' the following to include it in the database
# Read the saved 'rds' object into `d`, then rename its columns by applying
# tolower() to the existing names (setnames() modifies `d` in place).
d <- readRDS("../data/ETLSIM.DORES_2010.rds") # I saved the data as 'rds' to reduce the object weight
setnames(d, tolower)
#' You may also access the data from the cloud:
#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
#' d <- fread(url)
#' setnames(d, tolower)
# Load the locally stored 'rds' copy of the table and lower-case all of its
# column names in place.
d <- readRDS("../data/ETLSIM.DORES_2010.rds")
setnames(d, tolower)
#' ------------------------------------------------------------------------------
#' ## Update 2022:
#' Please note that the data has been updated since the build of this script, thus
#' some column names and types differ between tables
# Store `d` in the database as the table "Mortalidade_Br_2010".
dbWriteTable(conn = mortdb, name = "Mortalidade_Br_2010", value = d)
#' ## List the tables
#' ------------------------------------------------------------------------------
#' ## Reading again as data.table:
#' ## Reading again as data.table
#' ------------------------------------------------------------------------------
# Read the stored table back from the database. dbReadTable() returns a plain
# data.frame, so convert it by reference to a data.table before any
# data.table syntax (`.()`, `.N`, `by`) is used on it. setDT() is harmless if
# the object already is a data.table.
dt <- dbReadTable(mortdb, "Mortalidade_Br_2010")
setDT(dt)
# Lower-case the column names in place.
setnames(dt, tolower)
#' ## Disconnect
#' Disconnect and remove duplicated information
# Close the database connection before dropping its handle: removing the
# object alone leaves the connection open until it is garbage collected.
# The validity check avoids a warning if it was already disconnected.
if (dbIsValid(mortdb)) {
  dbDisconnect(mortdb)
}
# Drop the connection handle and the original table, which is now duplicated
# by `dt`.
rm(mortdb, d)
#' Then, you can select the columns or rows you want to use
# Restrict the table to the four columns of interest.
dt <- dt[, list(idade_obito_anos, def_sexo, dtobito, dtnasc)]
@ -97,7 +165,11 @@ str(dt)
# Number of rows for each combination of idade_obito_anos and def_sexo.
dt[, .N, by = .(idade_obito_anos, def_sexo)]
#' ## Recoding dates: not reading date as date
#' ## Recoding dates from integers to "IDate"
#' ------------------------------------------------------------------------------
@ -113,7 +185,11 @@ dt[, .N, .(year(idtnasc))]
# Number of rows per value of `age` (the third positional argument of
# `[.data.table` is `by`).
dt[, .N, (age)]
#' ## Visualizing data distribution
#' ------------------------------------------------------------------------------
ggplot(dt, aes(age, fill = def_sexo)) +
geom_histogram(bins = 200) +
@ -122,3 +198,9 @@ ggplot(dt, aes(age, fill = def_sexo)) +
#' The end
#' ## References
#' ------------------------------------------------------------------------------