From aa8fa958636e000be9074a6f3949bfa661350b09 Mon Sep 17 00:00:00 2001 From: Jose Date: Thu, 14 Jul 2022 09:18:58 -0300 Subject: [PATCH] Building a SQLITE database - data description --- .gitignore | 12 +++ README.md | 2 - README.org | 29 +++++++ script/db.R | 124 +++++++++++++++++++++++++++ script/mortal_anos.R | 196 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 361 insertions(+), 2 deletions(-) delete mode 100644 README.md create mode 100644 README.org create mode 100644 script/db.R create mode 100644 script/mortal_anos.R diff --git a/.gitignore b/.gitignore index cf088ec..21311b8 100644 --- a/.gitignore +++ b/.gitignore @@ -99,4 +99,16 @@ flycheck_*.el # network security /network-security.data +# local directory +local/ +# html files +*.html + +# sqlite database +*.sqlite + +# images +*.pdf +*.png +*.jpeg diff --git a/README.md b/README.md deleted file mode 100644 index ca72cec..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# mort_geral - diff --git a/README.org b/README.org new file mode 100644 index 0000000..b20a746 --- /dev/null +++ b/README.org @@ -0,0 +1,29 @@ +#+options: toc:nil num:nil todo:nil author:nil + +* mort_geral +:LOGBOOK: +CLOCK: [2022-07-14 jue 08:12]--[2022-07-14 jue 08:22] => 0:10 +CLOCK: [2022-07-13 mié 16:13]--[2022-07-13 mié 16:43] => 0:30 +CLOCK: [2022-07-13 mié 08:44]--[2022-07-13 mié 09:14] => 0:30 +CLOCK: [2022-07-11 lun 15:21]--[2022-07-11 lun 15:25] => 0:04 +CLOCK: [2022-07-11 lun 14:09]--[2022-07-11 lun 14:39] => 0:30 +CLOCK: [2022-07-08 vie 14:56]--[2022-07-08 vie 15:26] => 0:30 +:END: + +Análise de indicadores de saúde: mortalidade geral da população brasileira. 
+ + +** Scripts + +- [[./script/db.R][Usando RSQLite para salvar dados de maior tamanho: exemplo]] +- [[./script/mortal_anos.R][Descrição básica do indicador "mortalidade geral" da população brasileira 2010]] + +** Dados + +Os dados foram obtidos do Sistema de Informação de Mortalidade do Brasil ([[https://opendatasus.saude.gov.br/dataset/sim-1979-2019][SIM]]). +O Sistema faz parte das bases de dados públicas mantidas pelo Ministério de +Saúde do Brasil. + +- [[https://opendatasus.saude.gov.br/dataset/sim-1979-2019][Dados de mortalidade 1979-2019]] +- [[https://opendatasus.saude.gov.br/dataset/sim-2020-2021][Dados de mortalidade 2021]] +- [[https://databank.worldbank.org/metadataglossary/world-development-indicators/series/SP.DYN.CDRT.IN][Definição do indicador "mortalidade geral" - Banco Mundial]] diff --git a/script/db.R b/script/db.R new file mode 100644 index 0000000..c664adc --- /dev/null +++ b/script/db.R @@ -0,0 +1,124 @@ +#' --- +#' title: "Create database for mortality indicator Brazil population" +#' author: "José A Bran - https://ayuda.onecluster.org/" +#' date: "2021-04-22" +#' output: +#' html_document: +#' df_print: paged +#' toc: yes +#' toc_float: yes +#' --- + +#'+ r setup, include=FALSE +knitr::opts_chunk$set(echo = TRUE) + +############################################################################ +## From: ## +## ## +## https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html ## +############################################################################ + +rm(list = ls()) + +library(DBI) +library(RSQLite) +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + +#' ## How to deal with some big data for your machine memory ("data bigger than ram") +#' + +#' This database is large, thus to work with it in a local machine, a Sqlite database can be an option + + +#' ## How to create a new database +#' +#' Check the function for more information: +#' > ?dbConnect + +mortdb <- dbConnect(RSQLite::SQLite(), "mort_db.sqlite") + + 
+#' Disconnect: +#' dbDisconnect(mortdb) + +#' unlink("mort_db.sqlite") + +#' ## How to include a table in the Sqlite database +#' +#' Download the data, then load the table to be written in the workspace and use +#' the following to include it in the database + +d <- readRDS("../data/ETLSIM.DORES_2010.rds") # I saved the data as 'rds' to reduce the object weight +setDT(d) +setnames(d, tolower) +names(d) + +#' You may also access the data from the cloud: +#' +#' url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv' +#' +#' d <- fread(url) +#' setnames(d, tolower) +#' +#' ## Update 2022: +#' +#' Please note that the data has been updated since the build of this script, thus +#' some column names and types differ between tables + +dbWriteTable(mortdb, "Mortalidade_Br_2010", d) + + +#' ## List the tables + +dbListTables(mortdb) + +#' ## Reading again as data.table: + +dt = dbReadTable(mortdb, "Mortalidade_Br_2010") + +setDT(dt) +setnames(dt, tolower) +names(dt) + +#' ## Disconnect +#' + +dbDisconnect(mortdb) +rm(d) + +#' Then, you can select the columns or lines you want to use ('d' was removed above, so subset 'dt', the copy read back from the database) +#' + +dt = dt[, .(idade_obito_anos, def_sexo, dtobito, dtnasc)] + +str(dt) + +dt[, .N, .(idade_obito_anos, def_sexo)] + +#' ## Recoding dates: not reading date as date + +class(dt$dtobito) + + +dt[, `:=` (idtnasc = as.IDate(dtnasc, "%d%m%Y"), + idobito = as.IDate(dtobito, "%d%m%Y"))] + +dt[, age := year(idobito) - year(idtnasc) ] #' Age in years + +dt[, .N, .(year(idobito))] + +dt[, .N, .(year(idtnasc))] + +dt[, .N, (age)] + +#' ## Visualizing data distribution + +ggplot(dt, aes(age, fill = def_sexo)) + + geom_histogram(bins = 200) + + theme(legend.position = "") + + facet_wrap(~ def_sexo, ncol = 2) + + +#' The end diff --git a/script/mortal_anos.R b/script/mortal_anos.R new file mode 100644 index 0000000..b175e26 --- /dev/null +++ b/script/mortal_anos.R @@ -0,0 +1,196 @@ +#' --- +#' title: "Exploring data on mortality, Brazil - 2010" +#' author: "José A Bran 
jose.alfredo@posgrad.ufsc.br" +#' date: "2021-04-15" +#' output: +#' html_document: +#' df_print: paged +#' toc: yes +#' toc_float: yes +#' --- + +#'+ setup, include=FALSE +knitr::opts_chunk$set(echo = TRUE) + +library(httr) +library(read.dbc) +library(DT) +library(dygraphs) +library(knitr) +library(plotly) +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + + +#' Previously, the data was saved as RDS to reduce the weight (of data format) +#' +#' data <- fread("./data/ETLSIM.DORES_2010.csv"## ) +#' saveRDS(data, "ETLSIM.DORES_2010.rds") + +## Data for 2010 + +rm(list = ls()) + +#' 2021 database url = 'https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SIM/DO21OPEN.csv' + +url = 'https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv' + +d = fread(url, nrows = 500) # Reads only the first 500 rows as a sample; drop 'nrows' to download all the data ('select' picks columns) + + +################################################### +## d <- readRDS("../data/ETLSIM.DORES_2010.rds") ## +## setDT(d) ## +################################################### + +setnames(d, tolower) + +names(d) + +#' Selecting the columns used below +#' +cols = c('dtobito', 'dtnasc', 'sexo', 'idade_obito_anos', + 'racacor', 'causabas_categoria', 'causabas_capitulo', + 'res_sigla_uf', 'ocor_regiao') + + +d = d[, ..cols] + +str(d) + +#' ## Data óbito - Date of death + +d[, .N, dtobito] + +d[, `:=` (idtobito = as.IDate(sprintf("%08d", as.integer(dtobito)), "%d%m%Y"), # zero-pad to 8 digits: leading zeros are lost if ddmmyyyy was read as a number + idtnasc = as.IDate(sprintf("%08d", as.integer(dtnasc)), "%d%m%Y"))] + +d[, .N, idtobito] + +d[, .N, year(idtobito)] +d[, .N, month(idtobito)] +d[, .N, mday(idtobito)] + +testing = grep("(^20)-", d$idtobito, value = TRUE) # Values with incomplete year + +d[, idtobito := as.IDate(gsub("^20-", "2020-", idtobito)) ] # Substitute abnormal values in year, keeping the IDate class + +d[, .N, year(idtnasc)] +d[, .N, month(idtnasc)] +d[, .N, mday(idtnasc)] + + + +d[year(idtnasc) < 1900, .(idtnasc)] + +testable = grep("^(9)", d$dtnasc, value = TRUE) # abnormal values in year + +table(testable) + +d[, idtnasc := 
as.IDate(gsub("^(9)", "19", idtnasc)) ] # Substitute abnormal values in year, keeping the IDate class + +d[, .N, year(idtnasc)] + +d[, sum(is.na(idtobito))] + +class(d$idtobito) + +ggplot(d, aes(idtobito)) + + geom_histogram(bins = 100) + + +#' ## Data nascimento - Date of birth + +d[, .N, dtnasc] + +sum(is.na(d$dtnasc)) + +ggplot(d, aes(idtnasc)) + + geom_histogram(bins = 100) + +#' ## Sexo - Sex + +d[, .N, sexo] + +ggplot(d, aes(factor(sexo), idade_obito_anos)) + + geom_boxplot() + + +#' ## Idade - Age + +d[, .N, idade_obito_anos] + +#' Missing data + +d[, sum(is.na(idade_obito_anos))] + +ggplot(d, aes(idade_obito_anos)) + + geom_histogram(bins = 200) + + +p1 = ggplot(d[sexo != 'Ignorado', ], + aes(idade_obito_anos, fill = sexo)) + + geom_histogram(bins = 200, alpha = 0.7) + + labs(fill = '') + + theme(legend.position = c(.9, .9)) + + facet_wrap(~ sexo, ncol = 1) + +p1 + +p2 = ggplot(d[sexo != 'Ignorado', ], + aes(idade_obito_anos, fill = sexo)) + + geom_histogram(bins = 200, alpha = 0.7) + + labs(fill = '') + + theme(legend.position = c(.9, .9)) + + facet_grid(sexo ~ racacor) + +p2 + +#' ## Def Raça cor - Ethnic social representation +#' +#' Cor informada pelo responsável pelas informações do falecido. (1 – Branca; 2 +#' – Preta; 3 – Amarela; 4 – Parda; 5 – Indígena) + +d[, .N, racacor] + + +#' ## Causa básica - Cause of death + +d[, .N, causabas_categoria] + +## d[, .N, causabas_o] # not among the columns kept in 'cols' + +d[, .N, causabas_capitulo] # cid chapter + +cap = d[sexo != 'Ignorado', .N, .(causabas_capitulo, sexo)] + +names(cap) + +## count() %>% +## mutate(Sexo = as.factor(sexo)) + +p3 = ggplot(cap, aes(reorder(causabas_capitulo, +N), N, fill = sexo)) + + geom_col() + + coord_flip() + + theme_bw() + + labs(y = "", x = "", fill = '') + + facet_wrap(~ sexo) + +p3 + +#' ## Region of occurrence + + +d[, .N, ocor_regiao] + +## d %>% +## count(ocor_REGIAO) %>% +## ggplot(aes(reorder(ocor_REGIAO, +n), n)) + +## geom_col() + +## coord_flip() + + +#' ## Mortalidade por estado + +d[, .N, res_sigla_uf]