279 lines
11 KiB
Python
279 lines
11 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
import statsmodels.api as sm
|
|
import matplotlib.pyplot as plt
|
|
import preprocesar_datos as p_datos
|
|
import utilerias as ut
|
|
|
|
|
|
class SerieDeTiempo:
|
|
"""Esta clase gestiona la serie de tiempo."""
|
|
|
|
def __init__(self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False):
|
|
self.iniciar_variables(
|
|
variable=variable, freq=freq, alpha=alpha, steps=steps, savefig=savefig
|
|
)
|
|
self.carga_datos()
|
|
self.definir_variables()
|
|
self.plot_timeseries()
|
|
self.decompose_timeseries()
|
|
self.determine_arima_params()
|
|
self.fit_arima()
|
|
self.validate_forecast()
|
|
self.get_forecast()
|
|
|
|
def iniciar_variables(
|
|
self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False
|
|
):
|
|
"""This method saves init variables."""
|
|
self.variable = None
|
|
self.freq = freq
|
|
self.alpha = alpha
|
|
self.steps = steps
|
|
self._savefig = savefig
|
|
|
|
def carga_datos(self):
|
|
"""Carga los datos de los reportes al 911."""
|
|
try:
|
|
nombre_de_archivo = "data/preprocessed_dvgm.csv"
|
|
data_path = ut.abs_path(nombre_de_archivo)
|
|
self.data = pd.read_csv(data_path)
|
|
self.data.loc[:, "fecha_completa"] = pd.to_datetime(
|
|
self.data.loc[:, "fecha_completa"], format="%d/%m/%y %H:%M:%S"
|
|
)
|
|
except:
|
|
nombre_de_archivo = "data/dvgm.csv"
|
|
data_path = ut.abs_path(nombre_de_archivo)
|
|
self.data = pd.read_csv(data_path)
|
|
self.preprocesar_datos()
|
|
|
|
def preprocesar_datos(self):
|
|
"""Método que preprocesa los datos que lo requieren."""
|
|
p_datos.run(self.data)
|
|
# print(self.data.groupby("via_recepcion").count())
|
|
|
|
def definir_variables(self):
|
|
"""Método que define variables necesarias respecto a los datos."""
|
|
|
|
self.atributos = self.data.columns.values
|
|
|
|
if self.variable:
|
|
self.valores_de_variable = self.data.loc[:, self.variable].unique()
|
|
else:
|
|
self.valores_de_variable = ["General"]
|
|
self.generate_timeseries()
|
|
|
|
def generate_timeseries(self):
|
|
"""This method generates the timeseries based on the data."""
|
|
self.ts = {}
|
|
for valor_de_variable in self.valores_de_variable:
|
|
if self.variable:
|
|
ts = (
|
|
self.data.loc[self.data.loc[:, self.variable] == valor_de_variable]
|
|
.groupby(self.data.loc[:, "fecha_completa"].dt.date)
|
|
.count()
|
|
.loc[:, "numero_reporte"]
|
|
.asfreq("D")
|
|
)
|
|
else:
|
|
ts = (
|
|
self.data.groupby(self.data.loc[:, "fecha_completa"].dt.date)
|
|
.count()
|
|
.loc[:, "numero_reporte"]
|
|
.asfreq("D")
|
|
)
|
|
date_min = ts.index.min()
|
|
date_max = ts.index.max()
|
|
idx = pd.date_range(date_min, date_max)
|
|
ts = ts.reindex(idx)
|
|
ts = ts.interpolate()
|
|
self.ts[valor_de_variable] = ts.copy()
|
|
|
|
def savefig(self, fig_name="images/untitled.png"):
|
|
"""This methos saves figures with a given name."""
|
|
fig_path = ut.abs_path(fig_name)
|
|
if self._savefig:
|
|
plt.savefig(fig_path)
|
|
else:
|
|
plt.show()
|
|
|
|
def plot_timeseries(self):
|
|
"""This method plots the timeseries for each country."""
|
|
for valor_de_variable in self.valores_de_variable:
|
|
self.ts[valor_de_variable].plot(figsize=(15, 5))
|
|
plt.title(
|
|
"Reportes al 911 - Serie de Tiempo - {}".format(valor_de_variable)
|
|
)
|
|
plt.xlabel("fecha")
|
|
plt.ylabel("# de reportes")
|
|
fig_name = "images/{}_timeseries.png".format(valor_de_variable)
|
|
self.savefig(fig_name)
|
|
|
|
def decompose_timeseries(self):
|
|
"""This method applies seasonal decomposition to timeseries."""
|
|
self.decompose = {}
|
|
for valor_de_variable in self.valores_de_variable:
|
|
decomposition = sm.tsa.seasonal_decompose(
|
|
self.ts[valor_de_variable], model="additive", period=self.freq
|
|
)
|
|
fig = decomposition.plot()
|
|
fig.set_size_inches((15, 7))
|
|
fig.suptitle(
|
|
"Reportes al 911 - Serie de Tiempo - {} decomposition".format(
|
|
valor_de_variable
|
|
)
|
|
)
|
|
fig_name = "images/{}_decomposition_{}.png".format(
|
|
valor_de_variable, self.freq
|
|
)
|
|
self.savefig(fig_name)
|
|
|
|
def grid_search(self, valor_de_variable, pdq, seasonal_pdq, display=False):
|
|
"""This method implements the grid search."""
|
|
best_aic = 99999.9
|
|
best_pdq = (0, 0, 0)
|
|
best_seasonal_pdq = (0, 0, 0, 0)
|
|
for order in pdq:
|
|
for seasonal_order in seasonal_pdq:
|
|
try:
|
|
mod = sm.tsa.statespace.SARIMAX(
|
|
self.ts[valor_de_variable],
|
|
order=order,
|
|
seasonal_order=seasonal_order,
|
|
enforce_stationarity=False,
|
|
enforce_invertibility=False,
|
|
)
|
|
results = mod.fit(disp=False)
|
|
|
|
if display:
|
|
print(
|
|
"{} - ARIMA{}x{} - AIC: {:.2f}".format(
|
|
valor_de_variable, order, seasonal_order, results.aic
|
|
)
|
|
)
|
|
|
|
if results.aic < best_aic:
|
|
best_aic = results.aic
|
|
best_pdq = order
|
|
best_seasonal_pdq = seasonal_order
|
|
except:
|
|
continue
|
|
return (best_pdq, best_seasonal_pdq, best_aic)
|
|
|
|
def determine_arima_params(self):
|
|
"""This method determines ARIMA params based on a grid search."""
|
|
p = d = q = range(0, 2)
|
|
pdq = [(i, j, k) for i in p for j in d for k in q]
|
|
seasonal_pdq = [(x[0], x[1], x[2], self.freq) for x in pdq]
|
|
|
|
self.pdq = {}
|
|
self.seasonal_pdq = {}
|
|
self.aic = {}
|
|
|
|
for valor_de_variable in self.valores_de_variable:
|
|
print("\n\n------------------------------------")
|
|
print("{} ARIMA Grid Search ...".format(valor_de_variable))
|
|
print("------------------------------------\n\n")
|
|
|
|
(best_pdq, best_seasonal_pdq, best_aic) = self.grid_search(
|
|
valor_de_variable, pdq, seasonal_pdq
|
|
)
|
|
|
|
print("\nBest params for {} are:".format(valor_de_variable))
|
|
print(
|
|
"ARIMA{}x{} - AIC: {:.2f}".format(best_pdq, best_seasonal_pdq, best_aic)
|
|
)
|
|
self.pdq[valor_de_variable] = best_pdq
|
|
self.seasonal_pdq[valor_de_variable] = best_seasonal_pdq
|
|
self.aic[valor_de_variable] = best_aic
|
|
|
|
def fit_arima(self):
|
|
"""This method fits the ARIMA model based on best parameters."""
|
|
self.results = {}
|
|
for valor_de_variable in self.valores_de_variable:
|
|
mod = sm.tsa.statespace.SARIMAX(
|
|
self.ts[valor_de_variable],
|
|
order=self.pdq[valor_de_variable],
|
|
seasonal_order=self.seasonal_pdq[valor_de_variable],
|
|
enforce_stationarity=False,
|
|
enforce_invertibility=False,
|
|
)
|
|
results = mod.fit(disp=False)
|
|
print("\n\n------------------------------------")
|
|
print("{} Timeseries ARIMA results".format(valor_de_variable))
|
|
print("------------------------------------\n\n")
|
|
print(results.summary().tables[1])
|
|
results.plot_diagnostics(figsize=(15, 7))
|
|
fig_name = "images/{}_residuals_{}.png".format(valor_de_variable, self.freq)
|
|
self.savefig(fig_name)
|
|
self.results[valor_de_variable] = results
|
|
|
|
def validate_forecast(self):
|
|
"""This method validates the forecast and calculates RMSE."""
|
|
for valor_de_variable in self.valores_de_variable:
|
|
prediction = self.results[valor_de_variable].get_prediction(
|
|
start="2022-11-01", dynamic=False
|
|
)
|
|
confidence_interval = prediction.conf_int(alpha=self.alpha)
|
|
ax = self.ts[valor_de_variable].plot(label="Observed", figsize=(15, 7))
|
|
prediction.predicted_mean.plot(
|
|
ax=ax, label="One-step ahead Forecast", alpha=0.7
|
|
)
|
|
ax.fill_between(
|
|
confidence_interval.index,
|
|
confidence_interval.iloc[:, 0],
|
|
confidence_interval.iloc[:, 1],
|
|
color="k",
|
|
alpha=0.2,
|
|
)
|
|
ax.set_title(
|
|
"Reportes al 911 - Serie de Tiempo - {}".format(valor_de_variable)
|
|
)
|
|
ax.set_xlabel("Fecha")
|
|
ax.set_ylabel("# de reportes")
|
|
plt.legend()
|
|
fig_name = "images/{}_prediction_{}.png".format(
|
|
valor_de_variable, self.freq
|
|
)
|
|
self.savefig(fig_name)
|
|
|
|
observed = self.ts[valor_de_variable].loc["2022-11-01":]
|
|
forecasted = prediction.predicted_mean
|
|
mse = ((forecasted - observed) ** 2).mean()
|
|
print("\n\n------------------------------------")
|
|
print("{} Timeseries ARIMA results".format(valor_de_variable))
|
|
print("------------------------------------\n\n")
|
|
print("The Mean Squared Error (MSE) of the forecast is {:.2f}".format(mse))
|
|
print(
|
|
"The Root Mean Squared Error (RMSE) of the forecast is {:.2f}".format(
|
|
np.sqrt(mse)
|
|
)
|
|
)
|
|
|
|
def get_forecast(self):
|
|
"""This method does the forecast."""
|
|
for valor_de_variable in self.valores_de_variable:
|
|
forecast = self.results[valor_de_variable].get_forecast(steps=self.steps)
|
|
confidence_interval = forecast.conf_int(alpha=self.alpha)
|
|
ax = self.ts[valor_de_variable].plot(label="Observed", figsize=(15, 7))
|
|
forecast.predicted_mean.plot(ax=ax, label="Forecast")
|
|
ax.fill_between(
|
|
confidence_interval.index,
|
|
confidence_interval.iloc[:, 0],
|
|
confidence_interval.iloc[:, 1],
|
|
color="k",
|
|
alpha=0.2,
|
|
)
|
|
ax.set_title(
|
|
"Reportes al 911 - Serie de Tiempo - {}".format(valor_de_variable)
|
|
)
|
|
ax.set_xlabel("Fecha")
|
|
ax.set_ylabel("# de reportes")
|
|
plt.legend()
|
|
fig_name = "images/{}_forecast_{}.png".format(valor_de_variable, self.freq)
|
|
self.savefig(fig_name)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
st = SerieDeTiempo()
|