Building a SQLITE database - data description

2022-07-14 09:18:58 -03:00 · 2022-07-14 09:18:58 -03:00 · aa8fa95863
parent 708300da62
commit aa8fa95863
5 changed files with 361 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -99,4 +99,16 @@ flycheck_*.el
 # network security
 /network-security.data
 # local directory
 local/
 # html files
 *.html
 # sqlite database
 *.sqlite
 # images
 *.pdf
 *.png
 *.jpeg
--- a/README.md
+++ b/README.md
@ -1,2 +0,0 @@
 # mort_geral
--- a/README.org
+++ b/README.org
@ -0,0 +1,29 @@
 #+options: toc:nil num:nil todo:nil author:nil
 *  mort_geral
 :LOGBOOK:
 CLOCK: [2022-07-14 jue 08:12]--[2022-07-14 jue 08:22] =>  0:10
 CLOCK: [2022-07-13 mié 16:13]--[2022-07-13 mié 16:43] =>  0:30
 CLOCK: [2022-07-13 mié 08:44]--[2022-07-13 mié 09:14] =>  0:30
 CLOCK: [2022-07-11 lun 15:21]--[2022-07-11 lun 15:25] =>  0:04
 CLOCK: [2022-07-11 lun 14:09]--[2022-07-11 lun 14:39] =>  0:30
 CLOCK: [2022-07-08 vie 14:56]--[2022-07-08 vie 15:26] =>  0:30
 :END:
 Análise de indicadores de saúde: mortalidade geral da população brasileira.
 ** Scripts
 - [[./script/db.R][Usando RSQLite para salvar dados de maior tamanho: exemplo]]
 - [[./script/mortal_anos.R][Descrição básica do indicador "mortalidade geral" da população brasileira 2010]]
 ** Dados
 Os dados foram obtidos do Sistema de Informação de Mortalidade do Brasil ([[https://opendatasus.saude.gov.br/dataset/sim-1979-2019][SIM]]).
 O Sistema faz parte das bases de dados públicas mantidas pelo Ministério de
 Saúde do Brasil.
 - [[https://opendatasus.saude.gov.br/dataset/sim-1979-2019][Dados de mortalidade 1979-2019]]
 - [[https://opendatasus.saude.gov.br/dataset/sim-2020-2021][Dados de mortalidade 2020-2021]]
 - [[https://databank.worldbank.org/metadataglossary/world-development-indicators/series/SP.DYN.CDRT.IN][Definição do indicador "mortalidade geral" - Banco Mundial]]
--- a/script/db.R
+++ b/script/db.R
@ -0,0 +1,124 @@
 #' ---
 #' title: "Create database for mortality indicator Brazil population"
 #' author: "José A Bran - https://ayuda.onecluster.org/"
 #' date: "2021-04-22"
 #' output:
 #'   html_document:
 #'     df_print: paged
 #'     toc: yes
 #'     toc_float: yes
 #' ---
 # Chunk options for knitr::spin() must start with `#+` (not `#'+`), otherwise
 # the line is rendered as document text instead of configuring the chunk.
 #+ setup, include=FALSE
 knitr::opts_chunk$set(echo = TRUE)
 ############################################################################
 ## From:                                                                  ##
 ##                                                                        ##
 ## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
 ############################################################################
 # The workspace is deliberately NOT wiped with rm(list = ls()): doing so
 # destroys whatever the caller had loaded. Run this script in a fresh R
 # session instead.
 # DBI + RSQLite: database connection and table I/O.
 library(DBI)
 library(RSQLite)
 # data.table: in-memory manipulation of the mortality records.
 library(data.table)
 # ggplot2: plots; the black-and-white theme is set once for every figure.
 library(ggplot2)
 theme_set(theme_bw())
 #' ## How to deal with data too big for your machine memory ("data bigger than RAM")
 #'
 #' This database is large; to work with it on a local machine, a SQLite database can be an option.
 #'
 #' ## How to create a new database
 #'
 #' Check the function for more information:
 #' > ?dbConnect
 # Opens the file-backed SQLite database "mort_db.sqlite" in the working directory.
 mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite")
 #' To disconnect and delete the database file:
 #'
 #' dbDisconnect(mortdb)
 #'
 #' unlink("mort_db.sqlite")
 #'
 #' ## How to include a table in the SQLite database
 #'
 #' Download the data, then load the table to be written in the workspace and use
 #' the following to include it in the database
 d <- readRDS("../data/ETLSIM.DORES_2010.rds") # the data was saved as 'rds' to reduce the file size
 setDT(d)
 # Lower-case every column name so later code can refer to columns consistently.
 setnames(d, tolower)
 names(d)
 #' You may also access the data from the cloud:
 #'
 #' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
 #'
 #' d <- fread(url)
 #' setnames(d, tolower)
 #'
 #' ## Update 2022:
 #'
 #' Please note that the data has been updated since the build of this script, thus
 #' some column names and types differ between tables
 # overwrite = TRUE: the .sqlite file persists between runs, so without it a
 # second run of this script stops because the table already exists.
 dbWriteTable(mortdb, "Mortalidade_Br_2010", d, overwrite = TRUE)
 #' ## List the tables
 dbListTables(mortdb)
 #' ## Reading again as data.table:
 # dbReadTable() returns a data.frame; setDT() converts it by reference.
 dt <- dbReadTable(mortdb, "Mortalidade_Br_2010")
 setDT(dt)
 setnames(dt, tolower)
 names(dt)
 #' ## Disconnect
 #'
 dbDisconnect(mortdb)
 # Drop the original in-memory copy; from here on only `dt` (read back from
 # the database) is available.
 rm(d)
 #' Then, you can select the columns or lines you want to use
 #'
 # `d` was removed right after the database round trip, so the selection must
 # start from `dt`, the table read back from SQLite (same columns, lower-cased).
 dt <- dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)]
 str(dt)
 dt[, .N, by = .(idade_obito_anos, def_sexo)]
 #' ## Recoding dates: not reading date as date
 class(dt$dtobito)
 # Parse the raw day-month-year values into IDate columns.
 dt[, `:=`(idtnasc = as.IDate(dtnasc, "%d%m%Y"),
           idobito = as.IDate(dtobito, "%d%m%Y"))]
 # Age as a difference of calendar years (does not check whether the birthday
 # had already happened in the year of death).
 dt[, age := year(idobito) - year(idtnasc)]
 dt[, .N, by = .(year(idobito))]
 dt[, .N, by = .(year(idtnasc))]
 dt[, .N, by = age]
 #' ## Visualizing data distribution
 # "none" is the documented value for hiding the legend; an empty string is
 # not a valid legend position.
 ggplot(dt, aes(age, fill = def_sexo)) +
    geom_histogram(bins = 200) +
    theme(legend.position = "none") +
    facet_wrap(~ def_sexo, ncol = 2)
 #' The end
--- a/script/mortal_anos.R
+++ b/script/mortal_anos.R
@ -0,0 +1,196 @@
 #' ---
 #' title: "Exploring data on mortality, Brazil - 2010"
 #' author: "José A Bran jose.alfredo@posgrad.ufsc.br"
 #' date: "2021-04-15"
 #' output:
 #'   html_document:
 #'     df_print: paged
 #'     toc: yes
 #'     toc_float: yes
 #' ---
 # Chunk options for knitr::spin() must start with `#+` (not `#'+`), otherwise
 # the line is rendered as document text instead of configuring the chunk.
 #+ setup, include=FALSE
 knitr::opts_chunk$set(echo = TRUE)
 # Packages attached for this report. data.table and ggplot2 do the data
 # handling and plotting below; the others are kept as in the original script.
 library(httr)
 library(read.dbc)
 library(DT)
 library(dygraphs)
 library(knitr)
 library(plotly)
 library(data.table)
 library(ggplot2)
 # Black-and-white theme for every figure in the report.
 theme_set(theme_bw())
 #' Previously, the data was saved as RDS to reduce the file size:
 #'
 #' data <- fread("./data/ETLSIM.DORES_2010.csv")
 #'
 #' saveRDS(data, "ETLSIM.DORES_2010.rds")
 ## Data source: 2020 CSV (the 2010 RDS alternative is kept commented out below)
 # The workspace is deliberately NOT wiped with rm(list = ls()): doing so
 # destroys whatever the caller had loaded. Run this script in a fresh R
 # session instead.
 #' 2021 database url = 'https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SIM/DO21OPEN.csv'
 url <- 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
 # Reads only the first 500 rows as a sample; drop `nrows` to load everything.
 # fread() can also restrict the columns it reads (see its `select` argument).
 d <- fread(url, nrows = 500)
 ###################################################
 ## d <- readRDS("../data/ETLSIM.DORES_2010.rds") ##
 ## setDT(d)                                      ##
 ###################################################
 # Lower-case every column name so the selection below matches.
 setnames(d, tolower)
 names(d)
 #' Selecting 9 columns
 #'
 # NOTE(review): assumes the loaded table provides all of these columns -
 # confirm against the names(d) output above.
 cols <- c('dtobito', 'dtnasc', 'sexo', 'idade_obito_anos',
           'racacor', 'causabas_categoria', 'causabas_capitulo',
           'res_sigla_uf', 'ocor_regiao')
 # `..cols` tells data.table to look `cols` up in the calling scope.
 d <- d[, ..cols]
 str(d)
 #' ## Date of death (dtobito)
 d[, .N, by = dtobito]
 # Parse the raw day-month-year values into IDate columns.
 d[, `:=`(idtobito = as.IDate(as.character(dtobito), "%d%m%Y"),
          idtnasc = as.IDate(as.character(dtnasc), "%d%m%Y"))]
 d[, .N, by = idtobito]
 d[, .N, by = year(idtobito)]
 d[, .N, by = month(idtobito)]
 d[, .N, by = mday(idtobito)]
 # NOTE(review): the malformed years are presumably caused by leading zeros
 # lost when the raw dates are read as integers - confirm against the raw file.
 testing <- grep("(^20)-", d$idtobito, value = TRUE) # Values with incomplete year
 # Substitute abnormal year values. gsub() returns character, so the result is
 # converted back to IDate; otherwise the column stops being a date and the
 # histogram below fails on a non-continuous x.
 d[, idtobito := as.IDate(gsub("^20-", "2020-", idtobito), format = "%Y-%m-%d")]
 d[, .N, by = year(idtnasc)]
 d[, .N, by = month(idtnasc)]
 d[, .N, by = mday(idtnasc)]
 d[year(idtnasc) < 1900, .(idtnasc)]
 testable <- grep("^(9)", d$dtnasc, value = TRUE) # abnormal values in year
 table(testable)
 # Same repair for the birth date, again keeping the IDate class.
 d[, idtnasc := as.IDate(gsub("^(9)", "19", idtnasc), format = "%Y-%m-%d")]
 d[, .N, by = year(idtnasc)]
 d[, sum(is.na(idtobito))]
 class(d$idtobito)
 ggplot(d, aes(idtobito)) +
    geom_histogram(bins = 100)
 #' ## Date of birth (dtnasc)
 d[, .N, by = dtnasc]
 sum(is.na(d$dtnasc))
 # NOTE(review): this plots the raw `dtnasc` values, not the parsed `idtnasc`
 # dates - confirm which one is intended.
 ggplot(d, aes(dtnasc)) +
    geom_histogram(bins = 100)
 #' ## Sex
 d[, .N, by = sexo]
 # The column selection earlier in this script keeps `idade_obito_anos`; there
 # is no `idade` column in `d`, so every age reference below uses
 # `idade_obito_anos`.
 ggplot(d, aes(factor(sexo), idade_obito_anos)) +
    geom_boxplot()
 #' ## Age
 d[, .N, by = idade_obito_anos]
 #' Missing data
 d[, sum(is.na(idade_obito_anos))]
 ggplot(d, aes(idade_obito_anos)) +
    geom_histogram(bins = 200)
 # Age distribution by sex, excluding records whose sex is 'Ignorado'.
 p1 <- ggplot(d[sexo != 'Ignorado', ],
              aes(idade_obito_anos, fill = sexo)) +
    geom_histogram(bins = 200, alpha = 0.7) +
    labs(fill = '') +
    theme(legend.position = c(.9, .9)) +
    facet_wrap(~ sexo, ncol = 1)
 p1
 # Same distribution, split by sex (rows) and race/color (columns).
 p2 <- ggplot(d[sexo != 'Ignorado', ],
              aes(idade_obito_anos, fill = sexo)) +
    geom_histogram(bins = 200, alpha = 0.7) +
    labs(fill = '') +
    theme(legend.position = c(.9, .9)) +
    facet_grid(sexo ~ racacor)
 p2
 #' ## Race / skin color (racacor)
 #'
 #' Color reported by the person responsible for the information about the
 #' deceased. (1 – White; 2 – Black; 3 – Yellow; 4 – Brown (parda); 5 – Indigenous)
 d[, .N, by = racacor]
 #' ## Underlying cause of death
 # `causabas` and `causabas_o` are not among the columns kept by the selection
 # earlier in this script; the cause columns available in `d` are
 # `causabas_categoria` and `causabas_capitulo`.
 d[, .N, by = causabas_categoria]
 d[, .N, by = causabas_capitulo] # ICD chapter
 # Deaths per ICD chapter and sex, excluding records whose sex is 'Ignorado'.
 cap <- d[sexo != 'Ignorado', .N, by = .(causabas_capitulo, sexo)]
 names(cap)
 # A leftover dplyr fragment (`count() %>% mutate(...)`) used to sit here; it
 # was not attached to any data and stopped the script, so it was removed.
 p3 <- ggplot(cap, aes(reorder(causabas_capitulo, +N), N, fill = sexo)) +
    geom_col() +
    coord_flip() +
    theme_bw() +
    labs(y = "", x = "", fill = '') +
    facet_wrap(~ sexo)
 p3
 #' ## Region of occurrence (ocor_regiao)
 d[, .N, by = ocor_regiao]
 # Earlier dplyr version of a bar chart by region, kept for reference only:
 ## d %>%
 ##     count(ocor_REGIAO) %>%
 ##     ggplot(aes(reorder(ocor_REGIAO, +n), n)) +
 ##     geom_col() +
 ##     coord_flip()
 #' ## Mortality by state (res_sigla_uf)
 d[, .N, by = res_sigla_uf]