2022-07-14 14:18:58 +02:00
|
|
|
#' ---
|
2022-07-15 19:54:28 +02:00
|
|
|
#' title: "Create a SQLite database for mortality data - Brazil population"
|
2022-07-14 14:18:58 +02:00
|
|
|
#' author: "José A Bran - https://ayuda.onecluster.org/"
|
|
|
|
#' date: "2021-04-22"
|
|
|
|
#' output:
|
|
|
|
#' html_document:
|
2022-07-15 19:54:28 +02:00
|
|
|
#' code_folding: hide
|
|
|
|
#' toc: yes
|
|
|
|
#' toc_float:
|
|
|
|
#' smooth_scroll: true
|
|
|
|
#' highlight: zenburn
|
|
|
|
#' bibliography: ../bib/references.bib # References
|
|
|
|
#' csl: ../bib/apa-5th-edition.csl # Citation style language
|
2022-07-14 14:18:58 +02:00
|
|
|
#' ---
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#+ setup, include=FALSE
|
2022-07-14 14:18:58 +02:00
|
|
|
knitr::opts_chunk$set(echo = TRUE)
|
|
|
|
|
|
|
|
rm(list = ls())
|
|
|
|
|
|
|
|
library(DBI)
|
|
|
|
library(RSQLite)
|
|
|
|
library(data.table)
|
|
|
|
library(ggplot2)
|
|
|
|
theme_set(theme_bw())
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
|
|
|
#' ## Dealing with large data
|
|
|
|
#'
|
|
|
|
#' ------------------------------------------------------------------------------
|
|
|
|
#'
|
|
|
|
#'
|
|
|
|
#' R canot handle data larger than RAM. Therefore, how "big" some data is, is
|
|
|
|
#' relative to the memory and processing capacity of the machines.
|
|
|
|
#'
|
2022-07-16 02:41:20 +02:00
|
|
|
#' These population mortality data are larger than the RAM memory of most commom
|
2022-07-15 19:54:28 +02:00
|
|
|
#' personal computers (4 to 16 Gigabytes).
|
|
|
|
#'
|
|
|
|
#' Thus, it would be appropriate to look for smart strategies to deal with this data.
|
|
|
|
|
|
|
|
|
|
|
|
#' We may consider the following options:
|
|
|
|
#'
|
|
|
|
#'
|
|
|
|
#' - Download the data in csv format, and reduce the size of each file
|
2022-07-16 02:41:20 +02:00
|
|
|
#' - You may use 'rds' files in R for this purpose
|
2022-07-15 19:54:28 +02:00
|
|
|
#' - Check "?saveRDS" help for more information
|
2022-07-16 02:41:20 +02:00
|
|
|
#' - Work with pieces of data, extracting only columns or rows you are interested
|
2022-07-15 19:54:28 +02:00
|
|
|
#' in
|
|
|
|
#' - Explore a database solution
|
2022-07-16 19:12:29 +02:00
|
|
|
#' - You could potentially process data weighing up to approximately 100
|
|
|
|
#' gigabytes and millions of observations.
|
2022-07-16 02:41:20 +02:00
|
|
|
#' - There are multiple resources to work with SQL and NoSQL databases in R
|
|
|
|
#' - Take a look at RPostgreSQL package for SQL integration
|
2022-07-15 19:54:28 +02:00
|
|
|
#' - Check the "mongolite" package for NoSQL integration
|
|
|
|
#'
|
|
|
|
#' Learn about SQLite database can be an option to make a first approach to
|
|
|
|
#' basic concepts about databases and connections in R.
|
|
|
|
#'
|
|
|
|
|
|
|
|
#' A great discussion about this topic can be found in this book
|
|
|
|
#' [@wiley2020advanced]
|
|
|
|
#'
|
2022-07-14 14:18:58 +02:00
|
|
|
#'
|
2022-07-16 02:41:20 +02:00
|
|
|
#' And, don't forget to take a look at this:
|
2022-07-14 14:18:58 +02:00
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
?saveRDS
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
2022-07-14 14:18:58 +02:00
|
|
|
#'
|
2022-07-15 19:54:28 +02:00
|
|
|
#' ## How to create a new SQLite database
|
|
|
|
#'
|
|
|
|
#' ------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
############################################################################
|
|
|
|
## From: ##
|
|
|
|
## ##
|
|
|
|
## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
|
|
|
|
############################################################################
|
|
|
|
|
|
|
|
#' Check the function help in R for more information:
|
|
|
|
|
|
|
|
?dbConnect
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite")
|
|
|
|
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#' When you are done with the process of writing or extracting data, just
|
|
|
|
#' stop the connection with the database:
|
|
|
|
#'
|
2022-07-14 14:18:58 +02:00
|
|
|
#' dbDisconnect(mortdb)
|
2022-07-15 19:54:28 +02:00
|
|
|
#'
|
2022-07-14 14:18:58 +02:00
|
|
|
#' unlink("mort_db.sqlite")
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
?dbDisconnect
|
2022-07-14 14:18:58 +02:00
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
?unlink
|
2022-07-14 14:18:58 +02:00
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
|
|
|
#' ## Including tables in the SQLite database
|
|
|
|
#'
|
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
2022-07-16 02:41:20 +02:00
|
|
|
#' Download the data. Then load the table to be written in the workspace.
|
2022-07-15 19:54:28 +02:00
|
|
|
#'
|
|
|
|
#' I saved the data as 'rds' to reduce the object weight.
|
|
|
|
#'
|
2022-07-14 14:18:58 +02:00
|
|
|
#' You may also access the data from the cloud:
|
|
|
|
#'
|
|
|
|
#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
|
|
|
|
#'
|
|
|
|
#' d <- fread(url)
|
2022-07-15 19:54:28 +02:00
|
|
|
|
|
|
|
|
|
|
|
d <- readRDS("../data/ETLSIM.DORES_2010.rds")
|
|
|
|
|
|
|
|
setDT(d)
|
|
|
|
|
|
|
|
setnames(d, tolower)
|
|
|
|
|
|
|
|
head(names(d))
|
|
|
|
|
|
|
|
#' ------------------------------------------------------------------------------
|
2022-07-14 14:18:58 +02:00
|
|
|
#'
|
|
|
|
#' ## Update 2022:
|
|
|
|
#'
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
2022-07-14 14:18:58 +02:00
|
|
|
#' Please note that the data has been updated since the build of this script, thus
|
|
|
|
#' some colum names and type differ between tables
|
|
|
|
|
2022-07-16 02:41:20 +02:00
|
|
|
|
|
|
|
#' The following code is used to include the data on the database:
|
|
|
|
|
2022-07-14 14:18:58 +02:00
|
|
|
dbWriteTable(mortdb, "Mortalidade_Br_2010", d)
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
2022-07-14 14:18:58 +02:00
|
|
|
#' ## List the tables
|
2022-07-15 19:54:28 +02:00
|
|
|
#'
|
|
|
|
#' ------------------------------------------------------------------------------
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
dbListTables(mortdb)
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
|
|
|
#' ## Reading again as data.table
|
|
|
|
#'
|
|
|
|
#' ------------------------------------------------------------------------------
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
dt = dbReadTable(mortdb, "Mortalidade_Br_2010")
|
|
|
|
|
|
|
|
setDT(dt)
|
|
|
|
setnames(dt, tolower)
|
2022-07-15 19:54:28 +02:00
|
|
|
head(names(dt))
|
2022-07-14 14:18:58 +02:00
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#' Disconnect and remove duplicated information
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
dbDisconnect(mortdb)
|
2022-07-15 19:54:28 +02:00
|
|
|
rm(mortdb, d)
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
#' Then, you can select the columns or lines you want to use
|
|
|
|
|
2022-07-14 15:33:34 +02:00
|
|
|
dt = dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)]
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
str(dt)
|
|
|
|
|
|
|
|
dt[, .N, .(idade_obito_anos, def_sexo)]
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
|
|
|
#' ## Recoding dates from integers to "IDate"
|
|
|
|
#'
|
|
|
|
#' ------------------------------------------------------------------------------
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
class(dt$dtobito)
|
|
|
|
|
|
|
|
|
|
|
|
dt[, `:=` (idtnasc = as.IDate(dtnasc, "%d%m%Y"),
|
|
|
|
idobito = as.IDate(dtobito, "%d%m%Y"))]
|
|
|
|
|
|
|
|
dt[, age := year(idobito) - year(idtnasc) ] #' Age in years
|
|
|
|
|
|
|
|
dt[, .N, .(year(idobito))]
|
|
|
|
|
|
|
|
dt[, .N, .(year(idtnasc))]
|
|
|
|
|
|
|
|
dt[, .N, (age)]
|
|
|
|
|
2022-07-15 19:54:28 +02:00
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
2022-07-14 14:18:58 +02:00
|
|
|
#' ## Visualizing data distribution
|
2022-07-15 19:54:28 +02:00
|
|
|
#'
|
|
|
|
#' ------------------------------------------------------------------------------
|
2022-07-14 14:18:58 +02:00
|
|
|
|
|
|
|
ggplot(dt, aes(age, fill = def_sexo)) +
|
|
|
|
geom_histogram(bins = 200) +
|
|
|
|
theme(legend.position = "") +
|
|
|
|
facet_wrap(~ def_sexo, ncol = 2)
|
|
|
|
|
|
|
|
|
|
|
|
#' The end
|
2022-07-15 19:54:28 +02:00
|
|
|
|
|
|
|
#'------------------------------------------------------------------------------
|
|
|
|
#'
|
|
|
|
#' ## References
|
|
|
|
#'
|
|
|
|
#' ------------------------------------------------------------------------------
|