mort_geral/script/mortal_anos.R

197 lines
3.9 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#' ---
#' title: "Exploring data on mortality, Brazil - 2010"
#' author: "José A Bran jose.alfredo@posgrad.ufsc.br"
#' date: "2021-04-15"
#' output:
#'   html_document:
#'     df_print: paged
#'     toc: yes
#'     toc_float: yes
#' ---
#+ setup, include=FALSE
# Global knitr option: echo the source code of every chunk in the output.
knitr::opts_chunk$set(echo = TRUE)
# Attach packages. The order is kept as-is because packages attached later
# mask same-named functions from packages attached earlier.
library(httr)
library(read.dbc)
library(DT)
library(dygraphs)
library(knitr)
library(plotly)
library(data.table)
library(ggplot2)
# Make the black-and-white theme the default for all ggplot2 plots.
theme_set(theme_bw())
#' Previously, the data were saved as RDS to reduce the file size:
#'
#' `data <- fread("./data/ETLSIM.DORES_2010.csv")`
#'
#' `saveRDS(data, "ETLSIM.DORES_2010.rds")`
#'
#' The 2021 database is available at
#' <https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SIM/DO21OPEN.csv>.

## Data for 2020 (the URL below points to Mortalidade_Geral_2020.csv).
## The workspace is deliberately not cleared with `rm(list = ls())`: that call
## would delete objects of whoever sources or knits this script and does not
## reset attached packages or options anyway.
url <- "https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral_2020.csv"
## Only the first 500 rows are parsed to keep the exploration quick. Drop
## `nrows` to load every row, or pass `select =` to read only some columns.
d <- fread(url, nrows = 500)
## Alternative input: the locally cached 2010 extract.
## d <- readRDS("../data/ETLSIM.DORES_2010.rds")
## setDT(d)
## Lower-case all column names so the rest of the script can use one spelling.
setnames(d, tolower)
names(d)
#' Selecting 9 columns
#'
# Names of the columns to keep; everything else is dropped from `d`.
cols <- c(
  'dtobito', 'dtnasc', 'sexo', 'idade_obito_anos',
  'racacor', 'causabas_categoria', 'causabas_capitulo',
  'res_sigla_uf', 'ocor_regiao'
)
# `with = FALSE` makes data.table treat `cols` as a vector of column names.
d <- d[, cols, with = FALSE]
str(d)
#' ## Data óbito - Date of death
d[, .N, by = dtobito]

# Parse a `ddmmyyyy` date column into an IDate.
#
# x: dates given as numbers or digit strings in day-month-year order.
# Returns an IDate vector as long as `x`; values that cannot be parsed are NA.
#
# The value is zero-padded to 8 digits before parsing. When the column has
# been read as a number, days 1-9 lose their leading zero ("01102020" becomes
# 1102020) and "%d%m%Y" then splits the remaining 7 digits in the wrong
# places, giving a wrong day/month and a truncated year such as 20 or 950.
# Padding fixes the cause, so the year does not have to be patched afterwards
# with gsub() -- which would also turn the date columns into character and
# break the date histogram below.
parse_ddmmyyyy <- function(x) {
  as.IDate(sprintf("%08d", as.integer(x)), format = "%d%m%Y")
}

d[, `:=`(
  idtobito = parse_ddmmyyyy(dtobito),
  idtnasc = parse_ddmmyyyy(dtnasc)
)]

# Distribution of the parsed death dates.
d[, .N, by = idtobito]
d[, .N, by = year(idtobito)]
d[, .N, by = month(idtobito)]
d[, .N, by = mday(idtobito)]
# Sanity check: death dates outside 2020, the year of the file being read.
# Shown next to the raw value so any remaining oddity can be traced back.
d[!is.na(idtobito) & year(idtobito) != 2020, .(dtobito, idtobito)]

# Distribution of the parsed birth dates.
d[, .N, by = year(idtnasc)]
d[, .N, by = month(idtnasc)]
d[, .N, by = mday(idtnasc)]
# Sanity check: implausibly old birth years, again with the raw value.
d[year(idtnasc) < 1900, .(dtnasc, idtnasc)]

# Death dates that are missing or could not be parsed.
d[, sum(is.na(idtobito))]
class(d$idtobito)
ggplot(d, aes(idtobito)) +
  geom_histogram(bins = 100)
#' ## Data nascimento - Date of birth
d[, .N, by = dtnasc]
d[, sum(is.na(dtnasc))]
# Plot the parsed birth date instead of the raw `ddmmyyyy` value: binning the
# raw value orders observations by day of month first, so its histogram is not
# a time axis. `as.IDate()` is a no-op when `idtnasc` already is an IDate and
# converts it when it is stored as "yyyy-mm-dd" text.
ggplot(d, aes(as.IDate(idtnasc))) +
  geom_histogram(bins = 100) +
  labs(x = "idtnasc")
#' ## Sexo - Sex
d[, .N, by = sexo]
# Age at death by sex. `idade_obito_anos` is the age column kept by the column
# selection earlier in the script; `idade` is not among the kept columns, so
# referring to it fails with "object 'idade' not found".
ggplot(d, aes(factor(sexo), idade_obito_anos)) +
  geom_boxplot()
#' ## Idade - Age
# `idade_obito_anos` is used throughout this section: it is the age column
# kept by the column selection earlier in the script, whereas `idade` is not
# among the kept columns.
d[, .N, by = idade_obito_anos]
#' Missing data
d[, sum(is.na(idade_obito_anos))]
ggplot(d, aes(idade_obito_anos)) +
  geom_histogram(bins = 200)

# Both faceted plots use the same rows (sexo other than 'Ignorado') and the
# same layers, so they are built once and only the faceting differs.
known_sex <- d[sexo != 'Ignorado', ]
age_by_sex <- ggplot(known_sex, aes(idade_obito_anos, fill = sexo)) +
  geom_histogram(bins = 200, alpha = 0.7) +
  labs(fill = '') +
  # NOTE(review): a numeric `legend.position` is deprecated from ggplot2 3.5.0
  # (use `legend.position.inside`); kept so older ggplot2 versions still work.
  theme(legend.position = c(.9, .9))

# One panel per sex, stacked vertically.
p1 <- age_by_sex + facet_wrap(~ sexo, ncol = 1)
p1
# Sex in rows, race/colour in columns.
p2 <- age_by_sex + facet_grid(sexo ~ racacor)
p2
#' ## Def Raça cor - Ethnic social representation
#'
#' Colour/race as reported by the person who provided the information about
#' the deceased (1 White; 2 Black; 3 Yellow; 4 Brown; 5 Indigenous).
# Number of records per race/colour value.
d[, .N, by = racacor]
#' ## Causa básica - Cause of death
# Only `causabas_categoria` and `causabas_capitulo` are kept by the column
# selection earlier in the script, so the counts use those two columns
# (`causabas` and `causabas_o` are not available in `d`).
d[, .N, by = causabas_categoria]
d[, .N, by = causabas_capitulo] # ICD chapter
# Deaths per ICD chapter and sex, leaving out rows whose sex is 'Ignorado'.
cap <- d[sexo != 'Ignorado', .N, by = .(causabas_capitulo, sexo)]
names(cap)
# Horizontal bars, chapters ordered by count, one panel per sex.
p3 <- ggplot(cap, aes(reorder(causabas_capitulo, +N), N, fill = sexo)) +
  geom_col() +
  coord_flip() +
  theme_bw() +
  labs(y = "", x = "", fill = '') +
  facet_wrap(~ sexo)
p3
#' ## Região de ocorrência - Region of occurrence
# Number of records per value of `ocor_regiao`.
d[, .N, by = ocor_regiao]
## Disabled dplyr/ggplot2 bar chart of the same counts, kept for reference:
## d %>%
## count(ocor_REGIAO) %>%
## ggplot(aes(reorder(ocor_REGIAO, +n), n)) +
## geom_col() +
## coord_flip()
#' ## Mortalidade por estado - Mortality by state
# Number of records per value of `res_sigla_uf`.
d[, .N, by = res_sigla_uf]