Actualizar archivos y añadir imágenes

This commit is contained in:
Gregorio Reyes 2022-12-06 19:32:23 -06:00
parent 3451a55365
commit 5248ed6084
8 changed files with 137334 additions and 33 deletions

137041
data/preprocessed_dvgm.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
images/decomposition.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 168 KiB

BIN
images/forecast.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 118 KiB

BIN
images/prediccion.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

BIN
images/residuos.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

BIN
images/serie_de_tiempo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 100 KiB

View File

@ -1,43 +1,59 @@
import numpy as np
import pandas as pd
import utilerias as ut
def formatea_fecha(data):
    """Add a ``fecha_completa`` datetime column to ``data`` in place.

    The column is built by joining the ``fecha`` and ``hora_recepcion``
    string columns with a single space and parsing the result with the
    ``%d/%m/%y %H:%M:%S`` format.

    Args:
        data: DataFrame holding string columns ``fecha`` and
            ``hora_recepcion``. It is modified in place.

    Returns:
        None.

    Raises:
        ValueError: if a joined value does not match the expected format
            (raised by ``pd.to_datetime``).
    """
    marca_de_tiempo = data["fecha"] + " " + data["hora_recepcion"]
    # Assign the whole column instead of going through `.loc[:, col] = ...`:
    # writing parsed datetimes into an already existing object column with
    # `.loc` is done in place on pandas >= 2.0, which keeps the object dtype
    # and breaks later `.dt` access.
    data["fecha_completa"] = pd.to_datetime(
        marca_de_tiempo, format="%d/%m/%y %H:%M:%S"
    )
def limpia_descripcion_cierre(data):
    """Collapse rare ``descripcion_cierre`` categories in place.

    Values listed in the mapping below are replaced by ``"Otros"`` or
    ``"Hecho real"``; every other value is left untouched.

    Args:
        data: DataFrame with a ``descripcion_cierre`` column. It is
            modified in place.

    Returns:
        None.
    """
    renombre_de_valores = {
        "Cierre De Fuga": "Otros",
        "Fuera De Entidad Federativa": "Otros",
        "Fuera De Subzona": "Otros",
        "Remitido Al M.p.": "Hecho real",
        "Resolucion 1": "Hecho real",
        "Resolucion 5": "Hecho real",
    }
    # A single pass is enough: none of the replacement values is itself a
    # key of the mapping.
    data["descripcion_cierre"] = data["descripcion_cierre"].replace(
        renombre_de_valores
    )
def limpia_via_recepcion(data):
    """Collapse rare ``via_recepcion`` categories in place.

    Values listed in the mapping below are replaced by ``"Otros"``,
    ``"Gobierno"`` or ``"Telefono"``; every other value is left untouched.
    Matching is on whole cell values, so only the string ``"80"`` (not the
    integer 80) is affected by the ``"80"`` entry.

    Args:
        data: DataFrame with a ``via_recepcion`` column. It is modified in
            place.

    Returns:
        None.
    """
    renombre_de_valores = {
        "060 Monterrey": "Otros",
        "80": "Otros",
        "89": "Otros",
        "App": "Otros",
        "App 9-1-1": "Otros",
        "Cic": "Otros",
        "Rutina": "Gobierno",
        "Sala crisis": "Gobierno",
        "Telefonia movil": "Telefono",
        "Videovigilancia": "Gobierno",
        "Whats App": "Otros",
    }
    # A single pass is enough: none of the replacement values is itself a
    # key of the mapping.
    data["via_recepcion"] = data["via_recepcion"].replace(renombre_de_valores)
def guarda_en_csv(data):
    """Write ``data`` as CSV to ``data/preprocessed_dvgm.csv``.

    The relative name is resolved through ``ut.abs_path`` before writing.

    Args:
        data: DataFrame to persist.

    Returns:
        None.
    """
    data.to_csv(ut.abs_path("data/preprocessed_dvgm.csv"))
def run(data):
    """Apply every preprocessing step to ``data`` in order, then save it.

    The steps run in this fixed order: build the full timestamp, clean
    ``descripcion_cierre``, clean ``via_recepcion``, write the CSV.

    Args:
        data: DataFrame with the raw report columns. It is modified in
            place.

    Returns:
        None.
    """
    pasos = (
        formatea_fecha,
        limpia_descripcion_cierre,
        limpia_via_recepcion,
        guarda_en_csv,
    )
    for paso in pasos:
        paso(data)

View File

@ -1,5 +1,7 @@
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import preprocesar_datos as p_datos
import utilerias as ut
@ -7,27 +9,269 @@ import utilerias as ut
class SerieDeTiempo:
    """Time-series pipeline for the 911 report counts.

    Creating an instance runs the whole pipeline: load the data, build one
    daily series of report counts per value of ``variable`` (or a single
    ``"General"`` series), plot and decompose each series, grid-search the
    SARIMA orders, fit the model, validate it and forecast.
    """

    def __init__(self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False):
        """Store the settings and run the complete pipeline.

        Args:
            variable: name of the column used to split the data into one
                series per distinct value; ``None`` builds a single series.
            freq: period passed to the seasonal decomposition and used as
                the seasonal period of the SARIMA model.
            alpha: significance level passed to ``conf_int``.
            steps: number of steps to forecast.
            savefig: when true figures are written to disk, otherwise they
                are shown on screen.
        """
        self.iniciar_variables(
            variable=variable, freq=freq, alpha=alpha, steps=steps, savefig=savefig
        )
        # carga_datos leaves self.data preprocessed on both of its paths, so
        # no separate preprocessing step is needed before building series.
        self.carga_datos()
        self.definir_variables()
        self.plot_timeseries()
        self.decompose_timeseries()
        self.determine_arima_params()
        self.fit_arima()
        self.validate_forecast()
        self.get_forecast()

    def iniciar_variables(
        self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False
    ):
        """Save the constructor settings on the instance.

        See ``__init__`` for the meaning of each argument.
        """
        self.variable = variable
        self.freq = freq
        self.alpha = alpha
        self.steps = steps
        self._savefig = savefig

    def carga_datos(self):
        """Load the 911 report data into ``self.data``.

        The preprocessed CSV is used when it exists and its
        ``fecha_completa`` column parses as ISO timestamps; otherwise the
        raw CSV is loaded and preprocessed.
        """
        try:
            data_path = ut.abs_path("data/preprocessed_dvgm.csv")
            data = pd.read_csv(data_path)
            # The preprocessed file stores timestamps the way `to_csv`
            # writes datetimes, not in the raw day/month/year format.
            data["fecha_completa"] = pd.to_datetime(
                data["fecha_completa"], format="%Y-%m-%d %H:%M:%S"
            )
            self.data = data
        except (FileNotFoundError, KeyError, ValueError):
            # Missing, incomplete or unparsable cache: rebuild from raw data.
            data_path = ut.abs_path("data/dvgm.csv")
            self.data = pd.read_csv(data_path)
            self.preprocesar_datos()

    def preprocesar_datos(self):
        """Run the preprocessing module on ``self.data`` (in place)."""
        p_datos.run(self.data)

    def definir_variables(self):
        """Derive the column names and series keys, then build the series."""
        self.atributos = self.data.columns.values
        if self.variable:
            self.valores_de_variable = self.data.loc[:, self.variable].unique()
        else:
            self.valores_de_variable = ["General"]
        self.generate_timeseries()

    def generate_timeseries(self):
        """Build one daily report-count series per key in ``self.ts``.

        Each series counts the non-null ``numero_reporte`` values per
        calendar day of ``fecha_completa``; days without reports are
        filled by linear interpolation.
        """
        self.ts = {}
        for valor_de_variable in self.valores_de_variable:
            if self.variable:
                seleccion = self.data.loc[:, self.variable] == valor_de_variable
                registros = self.data.loc[seleccion]
            else:
                registros = self.data
            # Normalising keeps a datetime index, which `asfreq` can expand
            # to a gap-free daily range directly.
            dias = registros["fecha_completa"].dt.normalize()
            conteo = registros.groupby(dias)["numero_reporte"].count()
            self.ts[valor_de_variable] = conteo.asfreq("D").interpolate()

    def savefig(self, fig_name="images/untitled.png"):
        """Save the current figure under ``fig_name`` or show it.

        Args:
            fig_name: relative path of the image, resolved through
                ``ut.abs_path``. Only used when saving is enabled.
        """
        fig_path = ut.abs_path(fig_name)
        if self._savefig:
            plt.savefig(fig_path)
            # Release the figure; otherwise every saved plot stays open.
            plt.close()
        else:
            plt.show()

    def plot_timeseries(self):
        """Plot every series in ``self.ts``, one figure per key."""
        for valor_de_variable in self.valores_de_variable:
            self.ts[valor_de_variable].plot(figsize=(15, 5))
            plt.title(f"Reportes al 911 - Serie de Tiempo - {valor_de_variable}")
            plt.xlabel("fecha")
            plt.ylabel("# de reportes")
            self.savefig(f"images/{valor_de_variable}_timeseries.png")

    def decompose_timeseries(self):
        """Apply an additive seasonal decomposition to every series.

        The result for each key is plotted and kept in ``self.decompose``.
        """
        self.decompose = {}
        for valor_de_variable in self.valores_de_variable:
            decomposition = sm.tsa.seasonal_decompose(
                self.ts[valor_de_variable], model="additive", period=self.freq
            )
            self.decompose[valor_de_variable] = decomposition
            fig = decomposition.plot()
            fig.set_size_inches((15, 7))
            fig.suptitle(
                f"Reportes al 911 - Serie de Tiempo - {valor_de_variable} decomposition"
            )
            self.savefig(f"images/{valor_de_variable}_decomposition_{self.freq}.png")

    def grid_search(self, valor_de_variable, pdq, seasonal_pdq, display=False):
        """Search the SARIMA orders with the lowest AIC for one series.

        Args:
            valor_de_variable: key of the series in ``self.ts``.
            pdq: iterable of ``(p, d, q)`` orders to try.
            seasonal_pdq: iterable of ``(P, D, Q, s)`` seasonal orders to try.
            display: when true, print the AIC of every fitted combination.

        Returns:
            Tuple ``(best_pdq, best_seasonal_pdq, best_aic)``. When no
            combination can be fitted the orders are all zeros and the AIC
            is ``inf``.
        """
        best_aic = float("inf")
        best_pdq = (0, 0, 0)
        best_seasonal_pdq = (0, 0, 0, 0)
        # Looked up outside the try: a missing series is a caller error,
        # not a failed fit to be skipped.
        serie = self.ts[valor_de_variable]
        for order in pdq:
            for seasonal_order in seasonal_pdq:
                try:
                    mod = sm.tsa.statespace.SARIMAX(
                        serie,
                        order=order,
                        seasonal_order=seasonal_order,
                        enforce_stationarity=False,
                        enforce_invertibility=False,
                    )
                    results = mod.fit(disp=False)
                except Exception:
                    # Best effort: skip combinations that cannot be fitted,
                    # while letting KeyboardInterrupt/SystemExit through.
                    continue
                if display:
                    print(
                        f"{valor_de_variable} - ARIMA{order}x{seasonal_order}"
                        f" - AIC: {results.aic:.2f}"
                    )
                if results.aic < best_aic:
                    best_aic = results.aic
                    best_pdq = order
                    best_seasonal_pdq = seasonal_order
        return (best_pdq, best_seasonal_pdq, best_aic)

    def determine_arima_params(self):
        """Grid-search the SARIMA orders of every series.

        Every ``(p, d, q)`` with components in ``{0, 1}`` is combined with
        the same orders as seasonal terms of period ``self.freq``. The best
        orders and AIC are stored per key in ``self.pdq``,
        ``self.seasonal_pdq`` and ``self.aic``.
        """
        p = d = q = range(0, 2)
        pdq = [(i, j, k) for i in p for j in d for k in q]
        seasonal_pdq = [(x[0], x[1], x[2], self.freq) for x in pdq]
        self.pdq = {}
        self.seasonal_pdq = {}
        self.aic = {}
        for valor_de_variable in self.valores_de_variable:
            print("\n\n------------------------------------")
            print(f"{valor_de_variable} ARIMA Grid Search ...")
            print("------------------------------------\n\n")
            (best_pdq, best_seasonal_pdq, best_aic) = self.grid_search(
                valor_de_variable, pdq, seasonal_pdq
            )
            print(f"\nBest params for {valor_de_variable} are:")
            print(f"ARIMA{best_pdq}x{best_seasonal_pdq} - AIC: {best_aic:.2f}")
            self.pdq[valor_de_variable] = best_pdq
            self.seasonal_pdq[valor_de_variable] = best_seasonal_pdq
            self.aic[valor_de_variable] = best_aic

    def fit_arima(self):
        """Fit a SARIMA model per series with the orders found earlier.

        The coefficient table is printed, the diagnostics are plotted and
        the fitted results are stored per key in ``self.results``.
        """
        self.results = {}
        for valor_de_variable in self.valores_de_variable:
            mod = sm.tsa.statespace.SARIMAX(
                self.ts[valor_de_variable],
                order=self.pdq[valor_de_variable],
                seasonal_order=self.seasonal_pdq[valor_de_variable],
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
            results = mod.fit(disp=False)
            print("\n\n------------------------------------")
            print(f"{valor_de_variable} Timeseries ARIMA results")
            print("------------------------------------\n\n")
            print(results.summary().tables[1])
            results.plot_diagnostics(figsize=(15, 7))
            self.savefig(f"images/{valor_de_variable}_residuals_{self.freq}.png")
            self.results[valor_de_variable] = results

    def validate_forecast(self, start="2022-11-01"):
        """Plot in-sample predictions from ``start`` and print MSE and RMSE.

        Args:
            start: first date of the validation window; it is used both as
                the start of the prediction and as the start of the
                observed slice it is compared with.
        """
        for valor_de_variable in self.valores_de_variable:
            prediction = self.results[valor_de_variable].get_prediction(
                start=start, dynamic=False
            )
            confidence_interval = prediction.conf_int(alpha=self.alpha)
            ax = self.ts[valor_de_variable].plot(label="Observed", figsize=(15, 7))
            prediction.predicted_mean.plot(
                ax=ax, label="One-step ahead Forecast", alpha=0.7
            )
            ax.fill_between(
                confidence_interval.index,
                confidence_interval.iloc[:, 0],
                confidence_interval.iloc[:, 1],
                color="k",
                alpha=0.2,
            )
            ax.set_title(f"Reportes al 911 - Serie de Tiempo - {valor_de_variable}")
            ax.set_xlabel("Fecha")
            ax.set_ylabel("# de reportes")
            plt.legend()
            self.savefig(f"images/{valor_de_variable}_prediction_{self.freq}.png")
            observed = self.ts[valor_de_variable].loc[start:]
            forecasted = prediction.predicted_mean
            mse = ((forecasted - observed) ** 2).mean()
            print("\n\n------------------------------------")
            print(f"{valor_de_variable} Timeseries ARIMA results")
            print("------------------------------------\n\n")
            print(f"The Mean Squared Error (MSE) of the forecast is {mse:.2f}")
            print(
                "The Root Mean Squared Error (RMSE) of the forecast is "
                f"{np.sqrt(mse):.2f}"
            )

    def get_forecast(self):
        """Forecast ``self.steps`` steps ahead for every series and plot it.

        The plot shows the observed series, the forecast mean and the
        confidence band computed with ``self.alpha``.
        """
        for valor_de_variable in self.valores_de_variable:
            forecast = self.results[valor_de_variable].get_forecast(steps=self.steps)
            confidence_interval = forecast.conf_int(alpha=self.alpha)
            ax = self.ts[valor_de_variable].plot(label="Observed", figsize=(15, 7))
            forecast.predicted_mean.plot(ax=ax, label="Forecast")
            ax.fill_between(
                confidence_interval.index,
                confidence_interval.iloc[:, 0],
                confidence_interval.iloc[:, 1],
                color="k",
                alpha=0.2,
            )
            ax.set_title(f"Reportes al 911 - Serie de Tiempo - {valor_de_variable}")
            ax.set_xlabel("Fecha")
            ax.set_ylabel("# de reportes")
            plt.legend()
            self.savefig(f"images/{valor_de_variable}_forecast_{self.freq}.png")
if __name__ == "__main__":