Compare commits

...

3 Commits

Author SHA1 Message Date
Gregorio Reyes 59262b15df Agrega imágenes 2022-12-07 23:01:09 -06:00
Gregorio Reyes 5e62f5d870 Actualiza preprocesamiento de datos 2022-12-07 23:00:26 -06:00
Gregorio Reyes 6952b59710 Actualiza serie_de_tiempo.py 2022-12-07 22:59:50 -06:00
18 changed files with 137169 additions and 137079 deletions

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 192 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 118 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 89 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 126 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 109 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 85 KiB

View File

@ -3,13 +3,38 @@ import pandas as pd
import utilerias as ut
def reformatea_fecha(data):
    """Return a copy of ``data`` with the day/month mix-up in "fecha" undone.

    The "fecha" column is parsed as ``%d/%m/%y``. Rows whose parsed day is
    between 1 and 12 are treated as having day and month exchanged, so the
    two fields are swapped back; rows whose parsed day is 13 or later are
    kept as parsed. The resulting "fecha" values are ``%Y-%m-%d`` strings.

    Args:
        data: DataFrame with a "fecha" column of ``%d/%m/%y`` strings.

    Returns:
        A new DataFrame with the corrected rows first, followed by the rows
        that needed no correction. Original index labels are preserved.
        Rows whose "fecha" is missing match neither group and are left out.
        The input frame is not modified.
    """
    parsed = pd.to_datetime(data.loc[:, "fecha"], format="%d/%m/%y")
    day = parsed.dt.day

    swapped_mask = (day >= 1) & (day <= 12)
    kept_mask = day >= 13

    # Work on explicit copies so nothing is written through a filtered view
    # of the caller's frame.
    swapped_rows = data.loc[swapped_mask].copy()
    kept_rows = data.loc[kept_mask].copy()

    # Printing the parsed date as year-day-month and reading the text as
    # year-month-day is exactly the day/month swap, with no second parse.
    swapped_rows["fecha"] = parsed.loc[swapped_mask].dt.strftime("%Y-%d-%m")
    kept_rows["fecha"] = parsed.loc[kept_mask].dt.strftime("%Y-%m-%d")

    return pd.concat([swapped_rows, kept_rows])
def formatea_fecha(data):
    """Add a "fecha_completa" datetime column built from date and time.

    Joins the "fecha" and "hora_recepcion" columns with a single space and
    parses the result as ``%Y-%m-%d %H:%M:%S``.

    Args:
        data: DataFrame with string columns "fecha" (``%Y-%m-%d``) and
            "hora_recepcion" (``%H:%M:%S``). It is modified in place.

    Returns:
        None. The new column is written onto ``data``.
    """
    combined = data["fecha"] + " " + data["hora_recepcion"]
    # Assign the whole column instead of ``.loc[:, col] = ...`` so the column
    # takes the datetime dtype rather than keeping an existing object dtype.
    data["fecha_completa"] = pd.to_datetime(combined, format="%Y-%m-%d %H:%M:%S")
@ -53,7 +78,9 @@ def guarda_en_csv(data):
def run(data):
    """Run the preprocessing pipeline once and return the processed frame.

    The date column is corrected first by ``reformatea_fecha``; every later
    step works on that corrected frame, not on the raw input. The steps are
    called in this order: ``formatea_fecha``, ``limpia_descripcion_cierre``,
    ``limpia_via_recepcion`` and ``guarda_en_csv``.

    Args:
        data: Raw DataFrame as loaded by the caller.

    Returns:
        The frame returned by ``reformatea_fecha`` after the remaining steps
        have been applied to it.
    """
    reformatted_data = reformatea_fecha(data)
    formatea_fecha(reformatted_data)
    limpia_descripcion_cierre(reformatted_data)
    limpia_via_recepcion(reformatted_data)
    guarda_en_csv(reformatted_data)
    return reformatted_data

View File

@ -9,9 +9,22 @@ import utilerias as ut
class SerieDeTiempo:
"""Esta clase gestiona la serie de tiempo."""
def __init__(self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False):
def __init__(
self,
variable="general",
ts_freq="weeks",
freq=52,
alpha=0.05,
steps=52,
savefig=True,
):
self.iniciar_variables(
variable=variable, freq=freq, alpha=alpha, steps=steps, savefig=savefig
variable=variable,
ts_freq=ts_freq,
freq=freq,
alpha=alpha,
steps=steps,
savefig=savefig,
)
self.carga_datos()
self.definir_variables()
@ -23,10 +36,17 @@ class SerieDeTiempo:
self.get_forecast()
def iniciar_variables(
self, variable=None, freq=30, alpha=0.05, steps=30, savefig=False
self,
variable="general",
ts_freq="days",
freq=30,
alpha=0.05,
steps=30,
savefig=False,
):
"""This method saves init variables."""
self.variable = None
self.variable = variable
self.ts_freq = ts_freq
self.freq = freq
self.alpha = alpha
self.steps = steps
@ -39,7 +59,7 @@ class SerieDeTiempo:
data_path = ut.abs_path(nombre_de_archivo)
self.data = pd.read_csv(data_path)
self.data.loc[:, "fecha_completa"] = pd.to_datetime(
self.data.loc[:, "fecha_completa"], format="%d/%m/%y %H:%M:%S"
self.data.loc[:, "fecha_completa"], format="%Y-%m-%d %H:%M:%S"
)
except:
nombre_de_archivo = "data/dvgm.csv"
@ -47,9 +67,11 @@ class SerieDeTiempo:
self.data = pd.read_csv(data_path)
self.preprocesar_datos()
self.data.sort_values(by="fecha_completa", inplace=True)
def preprocesar_datos(self):
    """Preprocess the loaded data and keep the processed result.

    The value returned by ``p_datos.run`` replaces ``self.data``. The
    pipeline is invoked exactly once so its work is not repeated and its
    result is not discarded.
    """
    self.data = p_datos.run(self.data)
def definir_variables(self):
@ -57,17 +79,47 @@ class SerieDeTiempo:
self.atributos = self.data.columns.values
if self.variable:
self.valores_de_variable = self.data.loc[:, self.variable].unique()
if self.variable == "general":
self.valores_de_variable = ["general"]
else:
self.valores_de_variable = ["General"]
self.valores_de_variable = self.data.loc[:, self.variable].unique()
self.generate_timeseries()
def generate_timeseries(self):
"""This method generates the timeseries based on the data."""
self.ts = {}
for valor_de_variable in self.valores_de_variable:
if self.variable:
if self.variable == "general":
if self.ts_freq == "days":
ts = (
self.data.groupby(self.data.loc[:, "fecha_completa"].dt.date)
.count()
.loc[:, "numero_reporte"]
.asfreq("D")
)
elif self.ts_freq == "weeks":
self.data.loc[:, "fecha_completa"] = self.data.loc[
:, "fecha_completa"
].astype("datetime64[W]")
print(self.data)
ts = (
self.data.groupby(self.data.loc[:, "fecha_completa"].dt.date)
.count()
.loc[:, "numero_reporte"]
.asfreq("W-THU")
)
elif self.ts_freq == "months":
self.data.loc[:, "fecha_completa"] = self.data.loc[
:, "fecha_completa"
].astype("datetime64[M]")
ts = (
self.data.groupby(self.data.loc[:, "fecha_completa"].dt.date)
.count()
.loc[:, "numero_reporte"]
.asfreq("MS")
)
print(ts)
else:
ts = (
self.data.loc[self.data.loc[:, self.variable] == valor_de_variable]
.groupby(self.data.loc[:, "fecha_completa"].dt.date)
@ -75,25 +127,26 @@ class SerieDeTiempo:
.loc[:, "numero_reporte"]
.asfreq("D")
)
else:
ts = (
self.data.groupby(self.data.loc[:, "fecha_completa"].dt.date)
.count()
.loc[:, "numero_reporte"]
.asfreq("D")
)
date_min = ts.index.min()
date_max = ts.index.max()
idx = pd.date_range(date_min, date_max)
ts = ts.reindex(idx)
ts = ts.interpolate()
print("Min and Max dates: {} - {}".format(date_min, date_max))
if self.ts_freq == "days":
idx = pd.date_range(date_min, date_max)
ts = ts.reindex(idx, fill_value=0)
# ts = ts.interpolate()
else:
pass
self.ts[valor_de_variable] = ts.copy()
def savefig(self, fig_name="images/{}_{}_{}_untitled.png"):
    """Save the current figure under a formatted name, or show it.

    Args:
        fig_name: Path template. Its ``{}`` placeholders are filled, in
            order, with ``self.variable``, ``self.ts_freq`` and
            ``self.freq``. A name without placeholders is used unchanged.

    When ``self._savefig`` is truthy the figure is written to the resolved
    path and closed; otherwise it is displayed with ``plt.show()``.
    """
    fig_name_complete = fig_name.format(self.variable, self.ts_freq, self.freq)
    fig_path = ut.abs_path(fig_name_complete)
    if self._savefig:
        plt.savefig(fig_path)
        plt.close()
    else:
        plt.show()
@ -106,7 +159,7 @@ class SerieDeTiempo:
)
plt.xlabel("fecha")
plt.ylabel("# de reportes")
fig_name = "images/{}_timeseries.png".format(valor_de_variable)
fig_name = "images/{}_{}_{}_timeseries.png"
self.savefig(fig_name)
def decompose_timeseries(self):
@ -123,9 +176,7 @@ class SerieDeTiempo:
valor_de_variable
)
)
fig_name = "images/{}_decomposition_{}.png".format(
valor_de_variable, self.freq
)
fig_name = "images/{}_{}_{}_decomposition.png"
self.savefig(fig_name)
def grid_search(self, valor_de_variable, pdq, seasonal_pdq, display=False):
@ -204,21 +255,35 @@ class SerieDeTiempo:
print("------------------------------------\n\n")
print(results.summary().tables[1])
results.plot_diagnostics(figsize=(15, 7))
fig_name = "images/{}_residuals_{}.png".format(valor_de_variable, self.freq)
fig_name = "images/{}_{}_{}_residuals.png"
self.savefig(fig_name)
self.results[valor_de_variable] = results
def validate_forecast(self):
"""This method validates the forecast and calculates RMSE."""
for valor_de_variable in self.valores_de_variable:
prediction = self.results[valor_de_variable].get_prediction(
start="2022-11-01", dynamic=False
)
if self.ts_freq == "days":
prediction = self.results[valor_de_variable].get_prediction(
start="2022-09-01", dynamic=False
)
elif self.ts_freq == "weeks":
prediction = self.results[valor_de_variable].get_prediction(
start="2022-06-30", dynamic=False
)
elif self.ts_freq == "months":
prediction = self.results[valor_de_variable].get_prediction(
start="2022-01-01", dynamic=False
)
confidence_interval = prediction.conf_int(alpha=self.alpha)
ax = self.ts[valor_de_variable].plot(label="Observed", figsize=(15, 7))
prediction.predicted_mean.plot(
ax=ax, label="One-step ahead Forecast", alpha=0.7
)
print(confidence_interval)
print(confidence_interval.index)
print(confidence_interval.iloc[:, 0])
print(confidence_interval.iloc[:, 1])
ax.fill_between(
confidence_interval.index,
confidence_interval.iloc[:, 0],
@ -232,9 +297,7 @@ class SerieDeTiempo:
ax.set_xlabel("Fecha")
ax.set_ylabel("# de reportes")
plt.legend()
fig_name = "images/{}_prediction_{}.png".format(
valor_de_variable, self.freq
)
fig_name = "images/{}_{}_{}_prediction.png"
self.savefig(fig_name)
observed = self.ts[valor_de_variable].loc["2022-11-01":]
@ -270,7 +333,7 @@ class SerieDeTiempo:
ax.set_xlabel("Fecha")
ax.set_ylabel("# de reportes")
plt.legend()
fig_name = "images/{}_forecast_{}.png".format(valor_de_variable, self.freq)
fig_name = "images/{}_{}_{}_forecast.png"
self.savefig(fig_name)