From aa8fa958636e000be9074a6f3949bfa661350b09 Mon Sep 17 00:00:00 2001
From: Jose <greenleaves@disroot.org>
Date: Thu, 14 Jul 2022 09:18:58 -0300
Subject: [PATCH] Building a SQLITE database - data description

---
 .gitignore           |  12 +++
 README.md            |   2 -
 README.org           |  29 +++++++
 script/db.R          | 124 +++++++++++++++++++++++++++
 script/mortal_anos.R | 196 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 361 insertions(+), 2 deletions(-)
 delete mode 100644 README.md
 create mode 100644 README.org
 create mode 100644 script/db.R
 create mode 100644 script/mortal_anos.R

diff --git a/.gitignore b/.gitignore
index cf088ec..21311b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -99,4 +99,16 @@ flycheck_*.el
 # network security
 /network-security.data
 
+# local directory
+local/
 
+# html files
+*.html
+
+# sqlite database
+*.sqlite
+
+# images
+*.pdf
+*.png
+*.jpeg
diff --git a/README.md b/README.md
deleted file mode 100644
index ca72cec..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# mort_geral
-
diff --git a/README.org b/README.org
new file mode 100644
index 0000000..b20a746
--- /dev/null
+++ b/README.org
@@ -0,0 +1,29 @@
+#+options: toc:nil num:nil todo:nil author:nil
+
+*  mort_geral
+:LOGBOOK:
+CLOCK: [2022-07-14 jue 08:12]--[2022-07-14 jue 08:22] =>  0:10
+CLOCK: [2022-07-13 mié 16:13]--[2022-07-13 mié 16:43] =>  0:30
+CLOCK: [2022-07-13 mié 08:44]--[2022-07-13 mié 09:14] =>  0:30
+CLOCK: [2022-07-11 lun 15:21]--[2022-07-11 lun 15:25] =>  0:04
+CLOCK: [2022-07-11 lun 14:09]--[2022-07-11 lun 14:39] =>  0:30
+CLOCK: [2022-07-08 vie 14:56]--[2022-07-08 vie 15:26] =>  0:30
+:END:
+
+Análise de indicadores de saúde: mortalidade geral da população brasileira.
+
+
+** Scripts
+
+- [[./script/db.R][Usando RSQLite para salvar dados de maior tamanho: exemplo]]
+- [[./script/mortal_anos.R][Descrição básica do indicador "mortalidade geral" da população brasileira 2010]]
+
+** Dados
+
+Os dados foram obtidos do Sistema de Informação de Mortalidade do Brasil ([[https://opendatasus.saude.gov.br/dataset/sim-1979-2019][SIM]]).
+O Sistema faz parte das bases de dados públicas mantidas pelo Ministério de
+Saúde do Brasil.
+
+- [[https://opendatasus.saude.gov.br/dataset/sim-1979-2019][Dados de mortalidade 1979-2019]]
+- [[https://opendatasus.saude.gov.br/dataset/sim-2020-2021][Dados de mortalidade 2020-2021]]
+- [[https://databank.worldbank.org/metadataglossary/world-development-indicators/series/SP.DYN.CDRT.IN][Definição do indicador "mortalidade geral" - Banco Mundial]]
diff --git a/script/db.R b/script/db.R
new file mode 100644
index 0000000..c664adc
--- /dev/null
+++ b/script/db.R
@@ -0,0 +1,124 @@
+#' ---
+#' title: "Create a database for the mortality indicator of the Brazilian population"
+#' author: "José A Bran - https://ayuda.onecluster.org/"
+#' date: "2021-04-22"
+#' output:
+#'   html_document:
+#'     df_print: paged
+#'     toc: yes
+#'     toc_float: yes
+#' ---
+
+#+ setup, include=FALSE
+knitr::opts_chunk$set(echo = TRUE)
+#+ packages
+############################################################################
+## From:                                                                  ##
+##                                                                        ##
+## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ##
+############################################################################
+
+## The workspace is no longer wiped with rm(list = ls()); run in a fresh session.
+
+library(DBI)
+library(RSQLite)
+library(data.table)
+library(ggplot2)
+theme_set(theme_bw())
+
+#' ## How to deal with some big data for your machine memory ("data bigger than ram")
+#'
+
+#' This dataset is large, so a SQLite database is one way to work with it on a local machine
+
+
+#' ## How to create a new database
+#'
+#' See the help page for the connection arguments:
+#' > ?dbConnect
+
+mortdb <- dbConnect(drv = RSQLite::SQLite(), dbname = "mort_db.sqlite")
+
+
+#' To close the connection later:
+#' dbDisconnect(mortdb)
+
+#' To delete the database file: unlink("mort_db.sqlite")
+
+#' ## How to include a table in the SQLite database
+#'
+#' Download the data, load the table into the workspace, and then write it to
+#' the database as shown below
+
+d <- readRDS(file = "../data/ETLSIM.DORES_2010.rds") # stored as 'rds' to reduce the file size
+setDT(d)
+setnames(d, tolower(names(d)))
+colnames(d)
+
+#' You may also read the data straight from the cloud:
+#'
+#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
+#'
+#' d <- fread(url)
+#' setnames(d, tolower)
+#'
+#' ## Update 2022:
+#'
+#' Please note that the data has been updated since this script was written, so
+#' some column names and types differ between tables
+
+dbWriteTable(mortdb, "Mortalidade_Br_2010", d, overwrite = TRUE) # overwrite: a re-run must not fail on the existing table
+
+
+#' ## List the tables
+
+dbListTables(mortdb)
+
+#' ## Reading the table back as a data.table:
+
+dt <- dbReadTable(mortdb, "Mortalidade_Br_2010")
+
+setDT(dt)
+setnames(dt, tolower)
+names(dt)
+
+#' ## Disconnect
+#'
+
+dbDisconnect(mortdb)
+rm(d) # the in-memory copy is dropped; `dt` holds the table read back from the database
+
+#' Then, you can select the columns or rows you want to use
+#'
+
+dt <- dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)] # subset `dt`: `d` was removed above
+
+str(dt)
+
+dt[, .N, .(idade_obito_anos, def_sexo)]
+
+#' ## Recoding dates: the date columns are not read as dates
+
+class(dt$dtobito)
+
+## Dates are ddmmyyyy; zero-pad to 8 digits in case a leading zero was dropped (e.g. integer column).
+dt[, `:=` (idtnasc = as.IDate(sprintf("%08d", as.integer(dtnasc)), "%d%m%Y"),
+           idobito = as.IDate(sprintf("%08d", as.integer(dtobito)), "%d%m%Y"))]
+
+## Age in completed years: subtract one when the birthday had not been reached at death.
+dt[, age := year(idobito) - year(idtnasc) -
+         (format(idobito, "%m%d") < format(idtnasc, "%m%d"))]
+
+dt[, .N, .(year(idobito))]
+dt[, .N, .(year(idtnasc))]
+dt[, .N, by = age]
+
+#' ## Visualizing data distribution
+
+ggplot(dt, aes(age, fill = def_sexo)) +
+    geom_histogram(bins = 200) +
+    theme(legend.position = "none") + # the facet strips already label each group
+    facet_wrap(~ def_sexo, ncol = 2)
+
+
+#' The end
diff --git a/script/mortal_anos.R b/script/mortal_anos.R
new file mode 100644
index 0000000..b175e26
--- /dev/null
+++ b/script/mortal_anos.R
@@ -0,0 +1,196 @@
+#' ---
+#' title: "Exploring data on mortality, Brazil - 2010"
+#' author: "José A Bran jose.alfredo@posgrad.ufsc.br"
+#' date: "2021-04-15"
+#' output:
+#'   html_document:
+#'     df_print: paged
+#'     toc: yes
+#'     toc_float: yes
+#' ---
+
+#+ setup, include=FALSE
+knitr::opts_chunk$set(echo = TRUE)
+#+ packages
+library(httr)
+library(read.dbc)
+library(DT)
+library(dygraphs)
+library(knitr)
+library(plotly)
+library(data.table)
+library(ggplot2)
+theme_set(theme_bw())
+
+
+#' Previously, the data was saved as RDS because the file is smaller than the CSV
+#'
+#' data <- fread("./data/ETLSIM.DORES_2010.csv")
+#' saveRDS(data, "ETLSIM.DORES_2010.rds")
+
+## Data: the 2020 file is read from the URL below (the script was first written for 2010)
+
+## NOTE: rm(list = ls()) was dropped; start from a fresh R session instead.
+
+#' 2021 database url = 'https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SIM/DO21OPEN.csv'
+
+url <- 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv'
+
+d <- fread(url, nrows = 500) # Only the first 500 rows are read; drop 'nrows' for the full file ('select' picks columns)
+
+
+###################################################
+## d <- readRDS("../data/ETLSIM.DORES_2010.rds") ##
+## setDT(d)                                      ##
+###################################################
+
+setnames(d, tolower(names(d)))
+
+colnames(d)
+
+#' Selecting 9 columns
+#'
+cols <- c("dtobito", "dtnasc", "sexo", "idade_obito_anos", "racacor",
+          "causabas_categoria", "causabas_capitulo", "res_sigla_uf",
+          "ocor_regiao")
+
+
+d <- d[, cols, with = FALSE]
+
+str(d)
+
+#' ## Data óbito - Date of death
+
+d[, .N, dtobito]
+## Dates are ddmmyyyy; zero-pad to 8 digits in case a leading zero was dropped when the column was read as a number.
+d[, `:=` (idtobito = as.IDate(sprintf("%08d", as.integer(dtobito)), "%d%m%Y"),
+          idtnasc = as.IDate(sprintf("%08d", as.integer(dtnasc)), "%d%m%Y"))]
+
+d[, .N, idtobito]
+
+d[, .N, year(idtobito)]
+d[, .N, month(idtobito)]
+d[, .N, mday(idtobito)]
+
+testing <- grep("(^20)-", d$idtobito, value = TRUE) # Values with incomplete year
+
+d[, idtobito := as.IDate(gsub("^20-", "2020-", idtobito), format = "%Y-%m-%d")] # Substitute abnormal years; convert back so the column stays IDate
+
+d[, .N, year(idtnasc)]
+d[, .N, month(idtnasc)]
+d[, .N, mday(idtnasc)]
+
+## NOTE(review): the abnormal years presumably came from dropped leading zeros;
+## with the zero-padding above the two gsub() repairs should match nothing - confirm.
+
+d[year(idtnasc) < 1900, .(idtnasc)]
+
+testable <- grep("^(9)", d$dtnasc, value = TRUE) # raw values starting with 9 (presumably day 09 without its zero - confirm)
+
+table(testable)
+
+d[, idtnasc := as.IDate(gsub("^(9)", "19", idtnasc), format = "%Y-%m-%d")] # Substitute abnormal years; convert back so the column stays IDate
+
+d[, .N, year(idtnasc)]
+
+d[, sum(is.na(idtobito))]
+
+class(d$idtobito)
+
+ggplot(d, aes(idtobito)) +
+    geom_histogram(bins = 100)
+
+
+#' ## Data nascimento - Date of birth
+
+d[, .N, dtnasc]
+
+sum(is.na(d$dtnasc))
+
+ggplot(d, aes(idtnasc)) +
+    geom_histogram(bins = 100)
+
+#' ## Sexo - Sex
+
+d[, .N, sexo]
+
+ggplot(d, aes(factor(sexo), idade_obito_anos)) +
+    geom_boxplot()
+
+
+#' ## Idade - Age (column `idade_obito_anos`, the age column kept in `cols`)
+
+d[, .N, idade_obito_anos]
+
+#' Missing data
+
+d[, sum(is.na(idade_obito_anos))]
+
+ggplot(d, aes(idade_obito_anos)) +
+    geom_histogram(bins = 200)
+
+
+p1 <- ggplot(d[sexo != 'Ignorado', ],
+             aes(idade_obito_anos, fill = sexo)) +
+    geom_histogram(bins = 200, alpha = 0.7) +
+    labs(fill = '') +
+    theme(legend.position = c(.9, .9)) +
+    facet_wrap(~ sexo, ncol = 1)
+
+p1
+
+p2 <- ggplot(d[sexo != 'Ignorado', ],
+             aes(idade_obito_anos, fill = sexo)) +
+    geom_histogram(bins = 200, alpha = 0.7) +
+    labs(fill = '') +
+    theme(legend.position = c(.9, .9)) +
+    facet_grid(sexo ~ racacor)
+
+p2
+
+#' ## Def Raça cor - Ethnic social representation
+#'
+#' Colour reported by the person responsible for the information about the
+#' deceased (1 – Branca; 2 – Preta; 3 – Amarela; 4 – Parda; 5 – Indígena)
+
+d[, .N, racacor]
+
+
+#' ## Causa básica - Cause of death
+
+d[, .N, causabas_categoria] # ICD (CID) category
+
+## NOTE: causabas / causabas_o are not among the columns kept in `cols`, so they cannot be tabulated here.
+
+d[, .N, causabas_capitulo] # ICD (CID) chapter
+
+cap <- d[sexo != 'Ignorado', .N, .(causabas_capitulo, sexo)]
+
+names(cap)
+
+## Leftover dplyr draft, disabled because it had no input and failed; `cap` already holds the counts:
+##     count() %>% mutate(Sexo = as.factor(sexo))
+
+p3 <- ggplot(cap, aes(reorder(causabas_capitulo, +N), N, fill = sexo)) +
+    geom_col() +
+    coord_flip() +
+    theme_bw() +
+    labs(y = "", x = "", fill = '') +
+    facet_wrap(~ sexo)
+
+p3
+
+#' ## Região de ocorrência - Region of occurrence
+
+
+d[, .N, by = ocor_regiao]
+
+## d %>%
+##     count(ocor_REGIAO) %>%
+##     ggplot(aes(reorder(ocor_REGIAO, +n), n)) +
+##     geom_col() +
+##     coord_flip()
+
+
+#' ## Mortalidade por estado - Mortality by state
+
+d[, .N, by = res_sigla_uf]