mort_geral/script/db.R

#' ---
#' title: "Create a database for mortality indicators of the Brazilian population"
#' author: "José A Bran - https://ayuda.onecluster.org/"
#' date: "2021-04-22"
#' output:
#'   html_document:
#'     df_print: paged
#'     toc: yes
#'     toc_float: yes
#' ---

#+ setup, include=FALSE
# The chunk header above uses `#+` (not `#'+`): lines starting with `#'` are
# treated as prose by knitr::spin(), so the chunk options were never applied.
# Global chunk option: echo the code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

############################################################################
## From:                                                                  ##
##                                                                        ##
## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
############################################################################

# The workspace is deliberately NOT cleared with `rm(list = ls())`: doing so
# deletes every object of whoever sources or renders this script from an
# existing session. Run the script in a fresh R session instead.

# Packages: DBI/RSQLite for the database, data.table for data handling and
# ggplot2 for the plots.
library(DBI)
library(RSQLite)
library(data.table)
library(ggplot2)

# Use the black-and-white theme for every ggplot2 figure created below.
theme_set(theme_bw())

#' ## How to deal with data that is too big for your machine's memory ("data bigger than RAM")
#'

#' This dataset is large, so a SQLite database is a convenient way to work with it on a local machine


#' ## How to create a new database
#'
#' Check the function for more information:
#' > ?dbConnect

# Open a connection to the SQLite file "mort_db.sqlite" in the working
# directory and keep the connection handle in `mortdb`.
mortdb <- DBI::dbConnect(
    drv = RSQLite::SQLite(),
    dbname = "mort_db.sqlite"
)


#' Disconnect:
#' dbDisconnect(mortdb)

#' unlink("mort_db.sqlite")

#' ## How to include a table in the SQLite database
#'
#' Download the data, then load the table to be written in the workspace and use
#' the following to include it in the database

# Load the 2010 extract (saved as 'rds' to reduce the object weight), turn it
# into a data.table and lower-case all column names, then show the names.
d <- readRDS(file = "../data/ETLSIM.DORES_2010.rds")
data.table::setDT(d)
data.table::setnames(d, tolower(names(d)))
colnames(d)

#' You may also access the data from the cloud:
#'
#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
#'
#' d <- fread(url)
#' setnames(d, tolower)
#'
#' ## Update 2022:
#'
#' Please note that the data has been updated since this script was written, so
#' some column names and types differ between tables

# Write `d` into the database as table "Mortalidade_Br_2010".
# `overwrite = TRUE` makes the script re-runnable: "mort_db.sqlite" persists on
# disk, so without it a second run stops because the table already exists.
dbWriteTable(mortdb, "Mortalidade_Br_2010", d, overwrite = TRUE)


#' ## List the tables

dbListTables(mortdb)

#' ## Reading again as data.table:

# Read the stored table back from the database, convert it to a data.table
# and lower-case its column names, then show the names.
dt <- dbReadTable(conn = mortdb, name = "Mortalidade_Br_2010")

data.table::setDT(dt)
data.table::setnames(dt, tolower(names(dt)))
colnames(dt)

#' ## Disconnect
#'

# Close the database connection, then remove the object `d` from the workspace.
DBI::dbDisconnect(conn = mortdb)
rm(list = "d")

#' Then, you can select the columns or rows you want to use
#'

# Keep only the columns used in the rest of the script.
# The subset is taken from `dt` (the table read back from the database): `d` is
# removed right after the connection is closed, so indexing `d` here would stop
# the script because the object no longer exists.
dt <- dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)]

# Inspect the structure of the reduced table.
str(dt)

# Number of records per reported age and sex.
dt[, .N, by = .(idade_obito_anos, def_sexo)]

#' ## Recoding dates: not reading date as date

# Check how the date of death is currently stored.
class(dt[["dtobito"]])


# Parse the birth and death dates with the day-month-year format "%d%m%Y"
# and store them as new IDate columns.
dt[, c("idtnasc", "idobito") := list(
    as.IDate(dtnasc, "%d%m%Y"),
    as.IDate(dtobito, "%d%m%Y")
)]

# Age in years, taken as the difference between the two calendar years.
# NOTE(review): month and day are ignored, so the result can be one year above
# the completed age — confirm this is acceptable for the analysis.
dt[, age := year(idobito) - year(idtnasc)]

# Record counts by year of death, by year of birth and by computed age.
dt[, .N, by = .(year(idobito))]

dt[, .N, by = .(year(idtnasc))]

dt[, .N, by = (age)]

#' ## Visualizing data distribution

# Histogram of the computed age, one panel per value of `def_sexo`.
# The fill legend is hidden with the documented value "none" (an empty string
# is not a documented legend position); the facet strips already label each
# panel.
ggplot(dt, aes(age, fill = def_sexo)) +
    geom_histogram(bins = 200) +
    theme(legend.position = "none") +
    facet_wrap(~ def_sexo, ncol = 2)


#' The end