#' ---
#' title: "Create a SQLite database for mortality data - Brazil population"
#' author: "José A Bran - https://ayuda.onecluster.org/"
#' date: "2021-04-22"
#' output:
#'   html_document:
#'     code_folding: hide
#'     toc: yes
#'     toc_float:
#'       smooth_scroll: true
#'     highlight: zenburn
#' bibliography: ../bib/references.bib   # References
#' csl: ../bib/apa-5th-edition.csl       # Citation style language
#' ---

#+ setup, include=FALSE
knitr::opts_chunk$set(echo = TRUE)

rm(list = ls())

library(DBI)
library(RSQLite)
library(data.table)
library(ggplot2)

theme_set(theme_bw())

#'------------------------------------------------------------------------------
#'
#' ## Dealing with large data
#'
#' ------------------------------------------------------------------------------
#'
#'
#' R cannot handle data larger than RAM. Therefore, how "big" a data set is
#' depends on the memory and processing capacity of the machine.
#'
#' These population mortality data are larger than the RAM of most common
#' personal computers (4 to 16 gigabytes).
#'
#' Thus, it is worth looking for smart strategies to deal with these data.
#' We may consider the following options:
#'
#'
#' - Download the data in csv format, and reduce the size of each file
#'     - You may use 'rds' files in R for this purpose
#'     - Check "?saveRDS" for more information
#' - Work with pieces of data, extracting only the columns or rows you are
#'   interested in
#' - Explore a database solution
#'     - You could potentially process data weighing up to approximately 100
#'       gigabytes and millions of observations.
#'     - There are multiple resources for working with SQL and NoSQL databases in R
#'     - Take a look at the RPostgreSQL package for SQL integration
#'     - Check the "mongolite" package for NoSQL integration
#'
#' Learning about SQLite databases can be a good first approach to the
#' basic concepts of databases and connections in R.
#'
#' A great discussion about this topic can be found in this book
#' [@wiley2020advanced]
#'
#'
#' And, don't forget to take a look at this: ?saveRDS


#'------------------------------------------------------------------------------
#'
#' ## How to create a new SQLite database
#'
#' ------------------------------------------------------------------------------

############################################################################
## From:                                                                  ##
##                                                                        ##
## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
############################################################################

#' Check the function help in R for more information: ?dbConnect

mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite")

#' When you are done writing or extracting data, just close the
#' connection to the database:
#'
#' dbDisconnect(mortdb)
#'
#' unlink("mort_db.sqlite")

?dbDisconnect
?unlink

#'------------------------------------------------------------------------------
#'
#' ## Including tables in the SQLite database
#'
#'------------------------------------------------------------------------------
#'
#' Download the data. Then load the table to be written into the workspace.
#'
#' I saved the data as 'rds' to reduce the object weight.
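#'
#' As a minimal sketch of that step: assuming you downloaded the raw csv into
#' "../data/" (the csv file name below is an assumption, adjust it to your
#' download), you can read it once with fread() and store it as the smaller
#' 'rds' file that the rest of this script reads. The chunk is not evaluated.

#+ csv-to-rds, eval=FALSE
raw <- fread("../data/ETLSIM.DORES_2010.csv")   # hypothetical csv path: adjust
saveRDS(raw, "../data/ETLSIM.DORES_2010.rds")   # saveRDS compresses (gzip) by default
rm(raw)                                         # release the memory used by the csv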
#'
#' You may also access the data from the cloud:
#'
#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
#'
#' d <- fread(url)

d <- readRDS("../data/ETLSIM.DORES_2010.rds")

setDT(d)

setnames(d, tolower)

head(names(d))

#' ------------------------------------------------------------------------------
#'
#' ## Update 2022:
#'
#'------------------------------------------------------------------------------
#'
#' Please note that the data has been updated since this script was built, so
#' some column names and types differ between tables.

#' The following code writes the data into the database:

dbWriteTable(mortdb, "Mortalidade_Br_2010", d)

#'------------------------------------------------------------------------------
#'
#' ## List the tables
#'
#' ------------------------------------------------------------------------------

dbListTables(mortdb)

#'------------------------------------------------------------------------------
#'
#' ## Reading again as data.table
#'
#' ------------------------------------------------------------------------------

dt = dbReadTable(mortdb, "Mortalidade_Br_2010")

setDT(dt)

setnames(dt, tolower)

head(names(dt))

#' Disconnect and remove duplicated information

dbDisconnect(mortdb)

rm(mortdb, d)

#' Then, you can select the columns or rows you want to use

dt = dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)]

str(dt)

dt[, .N, .(idade_obito_anos, def_sexo)]

#'------------------------------------------------------------------------------
#'
#' ## Recoding dates from integers to "IDate"
#'
#' ------------------------------------------------------------------------------

class(dt$dtobito)

dt[, `:=` (idtnasc = as.IDate(dtnasc, "%d%m%Y"),
           idobito = as.IDate(dtobito, "%d%m%Y"))]

dt[, age := year(idobito) - year(idtnasc)]

#' Age in years (approximate: difference of calendar years)

dt[, .N, .(year(idobito))]

dt[, .N, .(year(idtnasc))]

dt[, .N, .(age)]

#'------------------------------------------------------------------------------
#'
#' ## Visualizing data distribution
#'
#' ------------------------------------------------------------------------------

ggplot(dt, aes(age, fill = def_sexo)) +
    geom_histogram(bins = 200) +
    theme(legend.position = "none") +
    facet_wrap(~ def_sexo, ncol = 2)

#' The end

#'------------------------------------------------------------------------------
#'
#' ## References
#'
#' ------------------------------------------------------------------------------