#' ---
#' title: "Create database for mortality indicator Brazil population"
#' author: "José A Bran - https://ayuda.onecluster.org/"
#' date: "2021-04-22"
#' output:
#'   html_document:
#'     df_print: paged
#'     toc: yes
#'     toc_float: yes
#' ---

#+ setup, include=FALSE
knitr::opts_chunk$set(echo = TRUE)

# Adapted from the RSQLite vignette:
# https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html

# The workspace is deliberately not cleared here: every object used below is
# created by this script, and wiping the caller's environment is a side effect
# a sourced script should not have.
library(DBI)
library(RSQLite)
library(data.table)
library(ggplot2)
theme_set(theme_bw())

#' ## How to deal with data that is too big for your machine memory ("data bigger than RAM")
#'
#' This database is large, thus to work with it on a local machine, a SQLite
#' database can be an option.

#' ## How to create a new database
#'
#' Check the function for more information:
#' > ?dbConnect

# Opens the on-disk SQLite file "mort_db.sqlite" (created if it does not exist).
mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite")

#' To disconnect and delete the database file:
#'
#'     dbDisconnect(mortdb)
#'     unlink("mort_db.sqlite")

#' ## How to include a table in the SQLite database
#'
#' Download the data, then load the table to be written into the workspace and
#' use the following to include it in the database.

# I saved the data as 'rds' to reduce the object weight
d <- readRDS("../data/ETLSIM.DORES_2010.rds")
setDT(d)
setnames(d, tolower)
names(d)

#' You may also access the data from the cloud:
#'
#'     url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
#'
#'     d <- fread(url)
#'     setnames(d, tolower)
#'
#' ## Update 2022:
#'
#' Please note that the data has been updated since this script was built, thus
#' some column names and types differ between tables.

# overwrite = TRUE keeps the script re-runnable: the database file persists
# between runs, and writing to an existing table would otherwise fail.
dbWriteTable(mortdb, "Mortalidade_Br_2010", d, overwrite = TRUE)

#' ## List the tables
dbListTables(mortdb)

#' ## Reading again as data.table:
dt <- dbReadTable(mortdb, "Mortalidade_Br_2010")
setDT(dt)
setnames(dt, tolower)
names(dt)

#' ## Disconnect
#'
dbDisconnect(mortdb)
rm(d)

#' Then, you can select the columns or lines you want to use
#'
# Subset the table read back from the database; `d` was removed above, so the
# selection has to come from `dt`.
dt <- dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)]
str(dt)
dt[, .N, by = .(idade_obito_anos, def_sexo)]

#' ## Recoding dates: the date columns are not read with a date class
class(dt$dtobito)
# The "%d%m%Y" format parses day, month and four-digit year with no separators.
dt[, `:=`(
  idtnasc = as.IDate(dtnasc, format = "%d%m%Y"),
  idobito = as.IDate(dtobito, format = "%d%m%Y")
)]

#' Age in years
# Completed years: take the difference of calendar years and subtract one when
# the death happened before the birthday in the year of death. A plain
# difference of years overstates the age for those cases.
dt[, age := year(idobito) - year(idtnasc) -
  (month(idobito) < month(idtnasc) |
    (month(idobito) == month(idtnasc) & mday(idobito) < mday(idtnasc)))]

dt[, .N, by = .(year(idobito))]
dt[, .N, by = .(year(idtnasc))]
dt[, .N, by = age]

#' ## Visualizing data distribution
# The facets already label each group, so the fill legend is hidden.
ggplot(dt, aes(age, fill = def_sexo)) +
  geom_histogram(bins = 200) +
  theme(legend.position = "none") +
  facet_wrap(~def_sexo, ncol = 2)

#' The end