Actualizar archivos y añadir imágenes

2022-12-06 19:32:23 -06:00 · 2022-12-06 19:32:23 -06:00 · 5248ed6084
commit 5248ed6084
parent 3451a55365
8 changed files with 137334 additions and 33 deletions
--- a/data/preprocessed_dvgm.csv
+++ b/data/preprocessed_dvgm.csv
--- a/images/decomposition.png
+++ b/images/decomposition.png
--- a/images/forecast.png
+++ b/images/forecast.png
--- a/images/prediccion.png
+++ b/images/prediccion.png
--- a/images/residuos.png
+++ b/images/residuos.png
--- a/images/serie_de_tiempo.png
+++ b/images/serie_de_tiempo.png
--- a/src/preprocesar_datos.py
+++ b/src/preprocesar_datos.py
@ -1,43 +1,59 @@
 import numpy as np
 import pandas as pd
 import utilerias as ut
 def formatea_fecha(data):
-    data.loc[:, "fecha_completa"] = data.loc[:, "fecha"] + \
+    data.loc[:, "fecha_completa"] = (
-        " " + data.loc[:, "hora_recepcion"]
+        data.loc[:, "fecha"] + " " + data.loc[:, "hora_recepcion"]
    )
    data.loc[:, "fecha_completa"] = pd.to_datetime(
-        data.loc[:, "fecha_completa"], format="%d/%m/%y %H:%M:%S")
+        data.loc[:, "fecha_completa"], format="%d/%m/%y %H:%M:%S"
    )
 def limpia_descripcion_cierre(data):
-    renombre_de_valores = {"Cierre De Fuga": "Otros",
+    renombre_de_valores = {
-                           "Fuera De Entidad Federativa": "Otros",
+        "Cierre De Fuga": "Otros",
-                           "Fuera De Subzona": "Otros",
+        "Fuera De Entidad Federativa": "Otros",
-                           "Remitido Al M.p.": "Hecho real",
+        "Fuera De Subzona": "Otros",
-                           "Resolucion 1": "Hecho real",
+        "Remitido Al M.p.": "Hecho real",
-                           "Resolucion 5": "Hecho real"}
+        "Resolucion 1": "Hecho real",
-    data.loc[:, "descripcion_cierre"] = data.loc[:,
+        "Resolucion 5": "Hecho real",
-                                                 "descripcion_cierre"].replace(renombre_de_valores)
+    }
    data.loc[:, "descripcion_cierre"] = data.loc[:, "descripcion_cierre"].replace(
        renombre_de_valores
    )
 def limpia_via_recepcion(data):
-    renombre_de_valores = {"060 Monterrey": "Otros",
+    renombre_de_valores = {
-                           "80": "Otros",
+        "060 Monterrey": "Otros",
-                           "89": "Otros",
+        "80": "Otros",
-                           "App": "Otros",
+        "89": "Otros",
-                           "App 9-1-1": "Otros",
+        "App": "Otros",
-                           "Cic": "Otros",
+        "App 9-1-1": "Otros",
-                           "Rutina": "Gobierno",
+        "Cic": "Otros",
-                           "Sala crisis": "Gobierno",
+        "Rutina": "Gobierno",
-                           "Telefonia movil": "Telefono",
+        "Sala crisis": "Gobierno",
-                           "Videovigilancia": "Gobierno",
+        "Telefonia movil": "Telefono",
-                           "Whats App": "Otros"}
+        "Videovigilancia": "Gobierno",
-    data.loc[:, "via_recepcion"] = data.loc[:,
+        "Whats App": "Otros",
-                                            "via_recepcion"].replace(renombre_de_valores)
+    }
    data.loc[:, "via_recepcion"] = data.loc[:, "via_recepcion"].replace(
        renombre_de_valores
    )
 def guarda_en_csv(data):
    nombre_de_archivo = "data/preprocessed_dvgm.csv"
    data_path = ut.abs_path(nombre_de_archivo)
    data.to_csv(data_path)
 def run(data):
    formatea_fecha(data)
    limpia_descripcion_cierre(data)
    limpia_via_recepcion(data)
    guarda_en_csv(data)
--- a/src/serie_de_tiempo.py
+++ b/src/serie_de_tiempo.py
@ -1,5 +1,7 @@
 import numpy as np
 import pandas as pd
 import statsmodels.api as sm
 import matplotlib.pyplot as plt
 import preprocesar_datos as p_datos
 import utilerias as ut
@ -7,27 +9,269 @@ import utilerias as ut
 class SerieDeTiempo:
    """Esta clase gestiona la serie de tiempo."""
-    def __init__(self):
+    def __init__(self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False):
        self.iniciar_variables(
            variable=variable, freq=freq, alpha=alpha, steps=steps, savefig=savefig
        )
        self.carga_datos()
        self.definir_variables()
-        self.preprocesar_datos()
+        self.plot_timeseries()
        self.decompose_timeseries()
        self.determine_arima_params()
        self.fit_arima()
        self.validate_forecast()
        self.get_forecast()
    def iniciar_variables(
        self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False
    ):
        """This method saves init variables."""
        self.variable = None
        self.freq = freq
        self.alpha = alpha
        self.steps = steps
        self._savefig = savefig
    def carga_datos(self):
        """Carga los datos de los reportes al 911."""
        try:
            nombre_de_archivo = "data/preprocessed_dvgm.csv"
            data_path = ut.abs_path(nombre_de_archivo)
            self.data = pd.read_csv(data_path)
            self.data.loc[:, "fecha_completa"] = pd.to_datetime(
                self.data.loc[:, "fecha_completa"], format="%d/%m/%y %H:%M:%S"
            )
        except:
            nombre_de_archivo = "data/dvgm.csv"
            data_path = ut.abs_path(nombre_de_archivo)
            self.data = pd.read_csv(data_path)
            self.preprocesar_datos()
-        nombre_de_archivo = "data/dvgm.csv"
+    def preprocesar_datos(self):
-        data_path = ut.abs_path(nombre_de_archivo)
+        """Método que preprocesa los datos que lo requieren."""
-        self.data = pd.read_csv(data_path)
+        p_datos.run(self.data)
        # print(self.data.groupby("via_recepcion").count())
    def definir_variables(self):
        """Método que define variables necesarias respecto a los datos."""
        self.atributos = self.data.columns.values
-    def preprocesar_datos(self):
+        if self.variable:
-        """Método que preprocesa los datos que lo requieren."""
+            self.valores_de_variable = self.data.loc[:, self.variable].unique()
-        p_datos.run(self.data)
+        else:
-        print(self.data.groupby("via_recepcion").count())
+            self.valores_de_variable = ["General"]
        self.generate_timeseries()
    def generate_timeseries(self):
        """This method generates the timeseries based on the data."""
        self.ts = {}
        for valor_de_variable in self.valores_de_variable:
            if self.variable:
                ts = (
                    self.data.loc[self.data.loc[:, self.variable] == valor_de_variable]
                    .groupby(self.data.loc[:, "fecha_completa"].dt.date)
                    .count()
                    .loc[:, "numero_reporte"]
                    .asfreq("D")
                )
            else:
                ts = (
                    self.data.groupby(self.data.loc[:, "fecha_completa"].dt.date)
                    .count()
                    .loc[:, "numero_reporte"]
                    .asfreq("D")
                )
            date_min = ts.index.min()
            date_max = ts.index.max()
            idx = pd.date_range(date_min, date_max)
            ts = ts.reindex(idx)
            ts = ts.interpolate()
            self.ts[valor_de_variable] = ts.copy()
    def savefig(self, fig_name="images/untitled.png"):
        """This methos saves figures with a given name."""
        fig_path = ut.abs_path(fig_name)
        if self._savefig:
            plt.savefig(fig_path)
        else:
            plt.show()
    def plot_timeseries(self):
        """This method plots the timeseries for each country."""
        for valor_de_variable in self.valores_de_variable:
            self.ts[valor_de_variable].plot(figsize=(15, 5))
            plt.title(
                "Reportes al 911 - Serie de Tiempo - {}".format(valor_de_variable)
            )
            plt.xlabel("fecha")
            plt.ylabel("# de reportes")
            fig_name = "images/{}_timeseries.png".format(valor_de_variable)
            self.savefig(fig_name)
    def decompose_timeseries(self):
        """This method applies seasonal decomposition to timeseries."""
        self.decompose = {}
        for valor_de_variable in self.valores_de_variable:
            decomposition = sm.tsa.seasonal_decompose(
                self.ts[valor_de_variable], model="additive", period=self.freq
            )
            fig = decomposition.plot()
            fig.set_size_inches((15, 7))
            fig.suptitle(
                "Reportes al 911 - Serie de Tiempo - {} decomposition".format(
                    valor_de_variable
                )
            )
            fig_name = "images/{}_decomposition_{}.png".format(
                valor_de_variable, self.freq
            )
            self.savefig(fig_name)
    def grid_search(self, valor_de_variable, pdq, seasonal_pdq, display=False):
        """This method implements the grid search."""
        best_aic = 99999.9
        best_pdq = (0, 0, 0)
        best_seasonal_pdq = (0, 0, 0, 0)
        for order in pdq:
            for seasonal_order in seasonal_pdq:
                try:
                    mod = sm.tsa.statespace.SARIMAX(
                        self.ts[valor_de_variable],
                        order=order,
                        seasonal_order=seasonal_order,
                        enforce_stationarity=False,
                        enforce_invertibility=False,
                    )
                    results = mod.fit(disp=False)
                    if display:
                        print(
                            "{} - ARIMA{}x{} - AIC: {:.2f}".format(
                                valor_de_variable, order, seasonal_order, results.aic
                            )
                        )
                    if results.aic < best_aic:
                        best_aic = results.aic
                        best_pdq = order
                        best_seasonal_pdq = seasonal_order
                except:
                    continue
        return (best_pdq, best_seasonal_pdq, best_aic)
    def determine_arima_params(self):
        """This method determines ARIMA params based on a grid search."""
        p = d = q = range(0, 2)
        pdq = [(i, j, k) for i in p for j in d for k in q]
        seasonal_pdq = [(x[0], x[1], x[2], self.freq) for x in pdq]
        self.pdq = {}
        self.seasonal_pdq = {}
        self.aic = {}
        for valor_de_variable in self.valores_de_variable:
            print("\n\n------------------------------------")
            print("{} ARIMA Grid Search ...".format(valor_de_variable))
            print("------------------------------------\n\n")
            (best_pdq, best_seasonal_pdq, best_aic) = self.grid_search(
                valor_de_variable, pdq, seasonal_pdq
            )
            print("\nBest params for {} are:".format(valor_de_variable))
            print(
                "ARIMA{}x{} - AIC: {:.2f}".format(best_pdq, best_seasonal_pdq, best_aic)
            )
            self.pdq[valor_de_variable] = best_pdq
            self.seasonal_pdq[valor_de_variable] = best_seasonal_pdq
            self.aic[valor_de_variable] = best_aic
    def fit_arima(self):
        """This method fits the ARIMA model based on best parameters."""
        self.results = {}
        for valor_de_variable in self.valores_de_variable:
            mod = sm.tsa.statespace.SARIMAX(
                self.ts[valor_de_variable],
                order=self.pdq[valor_de_variable],
                seasonal_order=self.seasonal_pdq[valor_de_variable],
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
            results = mod.fit(disp=False)
            print("\n\n------------------------------------")
            print("{} Timeseries ARIMA results".format(valor_de_variable))
            print("------------------------------------\n\n")
            print(results.summary().tables[1])
            results.plot_diagnostics(figsize=(15, 7))
            fig_name = "images/{}_residuals_{}.png".format(valor_de_variable, self.freq)
            self.savefig(fig_name)
            self.results[valor_de_variable] = results
    def validate_forecast(self):
        """This method validates the forecast and calculates RMSE."""
        for valor_de_variable in self.valores_de_variable:
            prediction = self.results[valor_de_variable].get_prediction(
                start="2022-11-01", dynamic=False
            )
            confidence_interval = prediction.conf_int(alpha=self.alpha)
            ax = self.ts[valor_de_variable].plot(label="Observed", figsize=(15, 7))
            prediction.predicted_mean.plot(
                ax=ax, label="One-step ahead Forecast", alpha=0.7
            )
            ax.fill_between(
                confidence_interval.index,
                confidence_interval.iloc[:, 0],
                confidence_interval.iloc[:, 1],
                color="k",
                alpha=0.2,
            )
            ax.set_title(
                "Reportes al 911 - Serie de Tiempo - {}".format(valor_de_variable)
            )
            ax.set_xlabel("Fecha")
            ax.set_ylabel("# de reportes")
            plt.legend()
            fig_name = "images/{}_prediction_{}.png".format(
                valor_de_variable, self.freq
            )
            self.savefig(fig_name)
            observed = self.ts[valor_de_variable].loc["2022-11-01":]
            forecasted = prediction.predicted_mean
            mse = ((forecasted - observed) ** 2).mean()
            print("\n\n------------------------------------")
            print("{} Timeseries ARIMA results".format(valor_de_variable))
            print("------------------------------------\n\n")
            print("The Mean Squared Error (MSE) of the forecast is {:.2f}".format(mse))
            print(
                "The Root Mean Squared Error (RMSE) of the forecast is {:.2f}".format(
                    np.sqrt(mse)
                )
            )
    def get_forecast(self):
        """This method does the forecast."""
        for valor_de_variable in self.valores_de_variable:
            forecast = self.results[valor_de_variable].get_forecast(steps=self.steps)
            confidence_interval = forecast.conf_int(alpha=self.alpha)
            ax = self.ts[valor_de_variable].plot(label="Observed", figsize=(15, 7))
            forecast.predicted_mean.plot(ax=ax, label="Forecast")
            ax.fill_between(
                confidence_interval.index,
                confidence_interval.iloc[:, 0],
                confidence_interval.iloc[:, 1],
                color="k",
                alpha=0.2,
            )
            ax.set_title(
                "Reportes al 911 - Serie de Tiempo - {}".format(valor_de_variable)
            )
            ax.set_xlabel("Fecha")
            ax.set_ylabel("# de reportes")
            plt.legend()
            fig_name = "images/{}_forecast_{}.png".format(valor_de_variable, self.freq)
            self.savefig(fig_name)
 if __name__ == "__main__":