mort_geral/script/db.R

#' ---
#' title: "Create a SQLite database for mortality data - Brazil population"
#' author: "José A Bran - https://ayuda.onecluster.org/"
#' date: "2021-04-22"
#' output:
#'   html_document:
#'    code_folding: hide
#'    toc: yes
#'    toc_float:
#'      smooth_scroll: true
#'    highlight: zenburn
#' bibliography: ../bib/references.bib  # References
#' csl: ../bib/apa-5th-edition.csl # Citation style language
#' ---

#+ setup, include=FALSE
knitr::opts_chunk$set(echo = TRUE)

rm(list = ls())

library(DBI)
library(RSQLite)
library(data.table)
library(ggplot2)
theme_set(theme_bw())

#'------------------------------------------------------------------------------
#'
#' ## Dealing with large data
#'
#' ------------------------------------------------------------------------------
#'
#'
#' R canot handle data larger than RAM. Therefore, how "big" some data is, is
#' relative to the memory and processing capacity of the machines.
#'
#' These population mortality data are larger than the RAM memory of most commom
#' personal computers (4 to 16 Gigabytes).
#'
#' Thus, it would be appropriate to look for smart strategies to deal with this data.


#' We may consider the following options:
#'
#'
#' - Download the data in csv format, and reduce the size of each file
#'  - You may use 'rds' files in R for this purpose
#'  - Check "?saveRDS" help for more information
#' - Work with pieces of data, extracting only columns or rows you are interested
#' in
#' - Explore a database solution
#'  - You could potentially process data weighing up to approximately 100
#' gigabytes and millions of observations.
#'  - There are multiple resources to work with SQL and NoSQL databases in R
#'  - Take a look at RPostgreSQL package for SQL integration
#'  - Check the "mongolite" package for NoSQL integration
#'
#' Learn about SQLite database can be an option to make a first approach to
#' basic concepts about databases and connections in R.
#'

#' A great discussion about this topic can be found in this book
#' [@wiley2020advanced]
#'
#'
#' And, don't forget to take a look at this:

?saveRDS


#'------------------------------------------------------------------------------
#'
#' ## How to create a new SQLite database
#'
#' ------------------------------------------------------------------------------

############################################################################
## From:                                                                  ##
##                                                                        ##
## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
############################################################################

#' Check the function help in R for more information:

?dbConnect

mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite")


#' When you are done with the process of writing or extracting data, just
#' stop the connection with the database:
#'
#' dbDisconnect(mortdb)
#'
#' unlink("mort_db.sqlite")

?dbDisconnect

?unlink

#'------------------------------------------------------------------------------
#'
#' ## Including tables in the SQLite database
#'
#'------------------------------------------------------------------------------
#'
#' Download the data. Then load the table to be written in the workspace.
#'
#' I saved the data as 'rds' to reduce the object weight.
#'
#' You may also access the data from the cloud:
#'
#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
#'
#' d <- fread(url)


d <- readRDS("../data/ETLSIM.DORES_2010.rds")

setDT(d)

setnames(d, tolower)

head(names(d))

#' ------------------------------------------------------------------------------
#'
#' ## Update 2022:
#'
#'------------------------------------------------------------------------------
#'
#' Please note that the data has been updated since the build of this script, thus
#' some colum names and type differ between tables


#' The following code is used to include the data on the database:

dbWriteTable(mortdb, "Mortalidade_Br_2010", d)

#'------------------------------------------------------------------------------
#'
#' ## List the tables
#'
#' ------------------------------------------------------------------------------

dbListTables(mortdb)

#'------------------------------------------------------------------------------
#'
#' ## Reading again as data.table
#'
#' ------------------------------------------------------------------------------

dt = dbReadTable(mortdb, "Mortalidade_Br_2010")

setDT(dt)
setnames(dt, tolower)
head(names(dt))

#' Disconnect and remove duplicated information

dbDisconnect(mortdb)
rm(mortdb, d)

#' Then, you can select the columns or lines you want to use

dt = dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)]

str(dt)

dt[, .N, .(idade_obito_anos, def_sexo)]

#'------------------------------------------------------------------------------
#'
#' ## Recoding dates from integers to "IDate"
#'
#' ------------------------------------------------------------------------------

class(dt$dtobito)


dt[, `:=` (idtnasc = as.IDate(dtnasc, "%d%m%Y"),
           idobito = as.IDate(dtobito, "%d%m%Y"))]

dt[, age := year(idobito) - year(idtnasc) ] #' Age in years

dt[, .N, .(year(idobito))]

dt[, .N, .(year(idtnasc))]

dt[, .N, (age)]

#'------------------------------------------------------------------------------
#'
#' ## Visualizing data distribution
#'
#' ------------------------------------------------------------------------------

ggplot(dt, aes(age, fill = def_sexo)) +
    geom_histogram(bins = 200) +
    theme(legend.position = "") +
    facet_wrap(~ def_sexo, ncol = 2)


#' The end

#'------------------------------------------------------------------------------
#'
#' ## References
#'
#' ------------------------------------------------------------------------------
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' ---`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' title: "Create a SQLite database for mortality data - Brazil population"`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' author: "José A Bran - https://ayuda.onecluster.org/"`
			`#' date: "2021-04-22"`
			`#' output:`
			`#' html_document:`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' code_folding: hide`
			`#' toc: yes`
			`#' toc_float:`
			`#' smooth_scroll: true`
			`#' highlight: zenburn`
			`#' bibliography: ../bib/references.bib # References`
			`#' csl: ../bib/apa-5th-edition.csl # Citation style language`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' ---`

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#+ setup, include=FALSE`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`knitr::opts_chunk$set(echo = TRUE)`

			`rm(list = ls())`

			`library(DBI)`
			`library(RSQLite)`
			`library(data.table)`
			`library(ggplot2)`
			`theme_set(theme_bw())`

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
			`#'`
			`#' ## Dealing with large data`
			`#'`
			`#' ------------------------------------------------------------------------------`
			`#'`
			`#'`
			`#' R canot handle data larger than RAM. Therefore, how "big" some data is, is`
			`#' relative to the memory and processing capacity of the machines.`
			`#'`
Adding dashboard with mortality rate by causes - CID 10 2022-07-16 02:41:20 +02:00			`#' These population mortality data are larger than the RAM memory of most commom`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' personal computers (4 to 16 Gigabytes).`
			`#'`
			`#' Thus, it would be appropriate to look for smart strategies to deal with this data.`


			`#' We may consider the following options:`
			`#'`
			`#'`
			`#' - Download the data in csv format, and reduce the size of each file`
Adding dashboard with mortality rate by causes - CID 10 2022-07-16 02:41:20 +02:00			`#' - You may use 'rds' files in R for this purpose`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' - Check "?saveRDS" help for more information`
Adding dashboard with mortality rate by causes - CID 10 2022-07-16 02:41:20 +02:00			`#' - Work with pieces of data, extracting only columns or rows you are interested`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' in`
			`#' - Explore a database solution`
Adding a script on mortality and age 2022-07-16 19:12:29 +02:00			`#' - You could potentially process data weighing up to approximately 100`
			`#' gigabytes and millions of observations.`
Adding dashboard with mortality rate by causes - CID 10 2022-07-16 02:41:20 +02:00			`#' - There are multiple resources to work with SQL and NoSQL databases in R`
			`#' - Take a look at RPostgreSQL package for SQL integration`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' - Check the "mongolite" package for NoSQL integration`
			`#'`
			`#' Learn about SQLite database can be an option to make a first approach to`
			`#' basic concepts about databases and connections in R.`
			`#'`

			`#' A great discussion about this topic can be found in this book`
			`#' [@wiley2020advanced]`
			`#'`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#'`
Adding dashboard with mortality rate by causes - CID 10 2022-07-16 02:41:20 +02:00			`#' And, don't forget to take a look at this:`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`?saveRDS`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#'`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' ## How to create a new SQLite database`
			`#'`
			`#' ------------------------------------------------------------------------------`

			`############################################################################`
			`## From: ##`
			`## ##`
			`## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##`
			`############################################################################`

			`#' Check the function help in R for more information:`

			`?dbConnect`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite")`


Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' When you are done with the process of writing or extracting data, just`
			`#' stop the connection with the database:`
			`#'`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' dbDisconnect(mortdb)`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' unlink("mort_db.sqlite")`

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`?dbDisconnect`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`?unlink`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
			`#'`
			`#' ## Including tables in the SQLite database`
			`#'`
			`#'------------------------------------------------------------------------------`
			`#'`
Adding dashboard with mortality rate by causes - CID 10 2022-07-16 02:41:20 +02:00			`#' Download the data. Then load the table to be written in the workspace.`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'`
			`#' I saved the data as 'rds' to reduce the object weight.`
			`#'`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' You may also access the data from the cloud:`
			`#'`
			`#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'`
			`#'`
			`#' d <- fread(url)`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00

			`d <- readRDS("../data/ETLSIM.DORES_2010.rds")`

			`setDT(d)`

			`setnames(d, tolower)`

			`head(names(d))`

			`#' ------------------------------------------------------------------------------`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#'`
			`#' ## Update 2022:`
			`#'`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
			`#'`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' Please note that the data has been updated since the build of this script, thus`
			`#' some colum names and type differ between tables`

Adding dashboard with mortality rate by causes - CID 10 2022-07-16 02:41:20 +02:00
			`#' The following code is used to include the data on the database:`

Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`dbWriteTable(mortdb, "Mortalidade_Br_2010", d)`

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
			`#'`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' ## List the tables`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'`
			`#' ------------------------------------------------------------------------------`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`dbListTables(mortdb)`

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
			`#'`
			`#' ## Reading again as data.table`
			`#'`
			`#' ------------------------------------------------------------------------------`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`dt = dbReadTable(mortdb, "Mortalidade_Br_2010")`

			`setDT(dt)`
			`setnames(dt, tolower)`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`head(names(dt))`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#' Disconnect and remove duplicated information`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`dbDisconnect(mortdb)`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`rm(mortdb, d)`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`#' Then, you can select the columns or lines you want to use`

Fixing typos 2022-07-14 15:33:34 +02:00			`dt = dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)]`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`str(dt)`

			`dt[, .N, .(idade_obito_anos, def_sexo)]`

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
			`#'`
			`#' ## Recoding dates from integers to "IDate"`
			`#'`
			`#' ------------------------------------------------------------------------------`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`class(dt$dtobito)`


			dt[, `:=` (idtnasc = as.IDate(dtnasc, "%d%m%Y"),
			`idobito = as.IDate(dtobito, "%d%m%Y"))]`

			`dt[, age := year(idobito) - year(idtnasc) ] #' Age in years`

			`dt[, .N, .(year(idobito))]`

			`dt[, .N, .(year(idtnasc))]`

			`dt[, .N, (age)]`

Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'------------------------------------------------------------------------------`
			`#'`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00			`#' ## Visualizing data distribution`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00			`#'`
			`#' ------------------------------------------------------------------------------`
Building a SQLITE database - data description 2022-07-14 14:18:58 +02:00
			`ggplot(dt, aes(age, fill = def_sexo)) +`
			`geom_histogram(bins = 200) +`
			`theme(legend.position = "") +`
			`facet_wrap(~ def_sexo, ncol = 2)`


			`#' The end`
Updating script SQLite-including references 2022-07-15 19:54:28 +02:00
			`#'------------------------------------------------------------------------------`
			`#'`
			`#' ## References`
			`#'`
			`#' ------------------------------------------------------------------------------`