modelSanoApp/scripts/pruebas_retrain.py
2024-05-21 09:59:31 +02:00

256 lines
10 KiB
Python

# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
from utils import fullRead, scaling
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Gráficos
# ==============================================================================
from plotting import plotTestVsPredicted, plotResiduals
# Modelado
# ==============================================================================
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from scripts.fit_models import XGBfit, XGBfit_GCV, LGBMfit
from RFfit import RFfit, RF_Fit_GCV, RFfit_lgbm
import optuna
import time
# Warning / logging configuration
# ==============================================================================
import warnings

# Hide deprecation-style warnings so they do not drown the progress output.
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Only let Optuna report messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)
import sys
# Experiment configuration
# ==============================================================================
# Input tables, given relative to the directory the script is launched from.
paths = ["plasmFlav_ord.csv", "plasmAnt_ord.csv", "urineFlav_ord.csv", "urineAnt_ord.csv"]
paths = ["../data/" + s for s in paths]

# When False, the clinical columns below are coerced to numeric right after reading.
full = True
# Switches selecting which model families are trained for every target.
RF = False
XGB = True
LGBM = True
# Columns whose fraction of missing values exceeds this are discarded.
NA_THRESHOLD = 0.4
# Features whose Pearson correlation with the target reaches this value are dropped.
threshold = 0.4
# Columns kept out of the imputation and re-attached to the imputed frame afterwards.
CATEGORICAL_COLS = ['Sex', 'Sweetener', 'Time']
CLINICAL_COLS = ['Weight', 'BMI', "Fat", "CVRI", "Bpmin", "Bpmax", "Frec"]


def _fit_with_timer(fit_fn, label, X_train, X_test, X_val, y_train, y_test, y_val):
    """Run ``fit_fn`` on the given splits, print the elapsed time and return the model.

    ``label`` is the model-family tag shown in the timing message, so every
    model reports its own name.
    """
    start = time.perf_counter()
    model = fit_fn(X_train, X_test, X_val, y_train, y_test, y_val)
    elapsed = time.perf_counter() - start
    print(f"Tiempo optimización + entrenamiento {label}: {elapsed:.2f} segundos")
    return model


def _predict_save_and_plot(model, X_test, y_test, target, modelname, df_name):
    """Predict on the test split, write prediction vs. truth to CSV and plot both.

    Returns the two-column frame (``<target>_pred``, ``<target>_test``) that is
    written to ``../results/predicts/``.
    """
    print(" ----------------- PREDICTING AND JOINING PRED + TEST ----------------- ")
    # Use a renamed copy: the caller's y_test is reused for the next model and
    # must reach every fit function under the same name.
    y_true = y_test.rename(target + "_test")
    y_pred = pd.Series(model.predict(X_test), index=y_true.index, name=target + "_pred")
    df_predTest = pd.concat([y_pred, y_true], axis=1)
    df_predTest.to_csv(
        "../results/predicts/df_pred-test_" + modelname + "_" + target + ".csv",
        sep=";",
    )
    plotTestVsPredicted(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)
    plotResiduals(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)
    return df_predTest


for path in paths:
    # Read the dataset of THIS iteration (the loop variable, not a fixed entry).
    print("------- READING " + path + " -----------")
    df, df_name = fullRead(path, sep=",", full=False)

    if not full:
        df[CLINICAL_COLS] = df[CLINICAL_COLS].apply(pd.to_numeric)

    # Drop the columns that are mostly missing.
    missing_percentages = df.isnull().mean()
    columns_to_remove = missing_percentages[missing_percentages > NA_THRESHOLD].index
    print("Columns removed by na's percentage:")
    print(columns_to_remove)
    df_filtered = df.drop(columns=columns_to_remove)

    # Blank out values that are to be re-estimated by the imputer: anything
    # above 2 and the listed sentinel values. Both steps are limited to the
    # measurement columns so the volunteer identifier (numVol) is never touched.
    # NOTE(review): the sentinel list contains 1.1 (not 1.0) -- confirm intent.
    numCols = df_filtered.select_dtypes(include=np.number).drop("numVol", axis=1).columns
    df_filtered[numCols] = df_filtered[numCols].apply(lambda x: np.where(x > 2, np.nan, x))
    df_filtered[numCols] = df_filtered[numCols].replace([0, 1, 0.0, 1.1], np.nan)

    print ("----------- IMPUTING DATASET BY MICE ALGORITHM ------------------")
    iimp = IterativeImputer(
        estimator=XGBRegressor(),
        random_state=42,
        verbose=0,
    )
    # Keep column names and index so the result can be concatenated back.
    iimp.set_output(transform="pandas")
    df_categorical = df_filtered[CATEGORICAL_COLS]
    df_imp = iimp.fit_transform(df_filtered.drop(CATEGORICAL_COLS, axis=1))
    df_filtered = pd.concat([df_imp, df_categorical], axis=1)
    df_filtered[['Sex', 'Sweetener']] = df_filtered[['Sex', 'Sweetener']].astype("category")
    df_filtered.to_csv("../results/df_" + df_name + "_imputed.csv", sep=";")

    # Pearson correlation between all numeric columns except the identifier.
    numCols = df_filtered.select_dtypes(include=np.number).drop("numVol", axis=1).columns
    df_corr = df_filtered[numCols].corr(method="pearson")
    print (df_corr)

    # One independent model per remaining column.
    for target in df_filtered.drop(["numVol", "Time", "Sweetener", "Sex"], axis=1).columns:
        print("------------ MODELLING " + target + " -----------")

        # Features strongly (positively) correlated with the target are removed.
        # NOTE(review): signed correlations are compared, so strongly negative
        # ones are kept -- confirm whether absolute values were intended.
        target_correlation = df_corr[target]
        highly_correlated = target_correlation[target_correlation >= threshold]
        print(highly_correlated)
        # Filter instead of list.remove(): the target is missing from the list
        # when its self-correlation is NaN, and remove() would then raise.
        correlated_features = [col for col in highly_correlated.index if col != target]
        print("--------------- Removed features for " + target + ": ---------------")
        print(correlated_features)
        df = df_filtered.drop(correlated_features, axis=1)

        # Predict the final value of the target from the initial measurements.
        X = df[df["Time"] == "Initial"].set_index("numVol").drop(["Time", target], axis=1)
        y = df[df["Time"] == "Final"].set_index("numVol")[target]
        # train_test_split pairs rows by position; when the two time points do
        # not list the same volunteers in the same order, align them on the
        # shared identifiers first. No-op when already aligned.
        if X.index.is_unique and y.index.is_unique and not X.index.equals(y.index):
            shared_ids = X.index.intersection(y.index)
            X, y = X.loc[shared_ids], y.loc[shared_ids]

        # 80/20 train/test split, then 80/20 train/validation split of the train part.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            train_size=0.8,
            random_state=42,
            shuffle=True,
        )
        X_train, X_val, y_train, y_val = train_test_split(
            X_train,
            y_train,
            train_size=0.8,
            random_state=42,
            shuffle=True,
        )
        print("Observaciones en train:", X_train.shape)
        print("Observaciones en validation:", X_val.shape)
        print("Observaciones en test:", X_test.shape)
        print(X.columns)
        print(y.name)

        # (enabled, label used in log messages, name used in output files, fit function)
        model_runs = (
            (RF, "RF", "rf_lgbm_OneTarget_noScaling", RFfit_lgbm),
            (XGB, "XGB", "xgb", XGBfit),
            (LGBM, "LGBM", "lgbm", LGBMfit),
        )
        for enabled, label, modelname, fit_fn in model_runs:
            if not enabled:
                continue
            print(f" ----------------- STARTING {label} ----------------- ")
            model = _fit_with_timer(
                fit_fn, label, X_train, X_test, X_val, y_train, y_test, y_val
            )
            df_predTest = _predict_save_and_plot(
                model, X_test, y_test, target, modelname, df_name
            )