# Data preprocessing
# ==============================================================================
import numpy as np
import pandas as pd
from utils import fullRead, scaling
# Importing enable_iterative_imputer is required: IterativeImputer is still
# experimental in scikit-learn and only becomes available after this import.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Plotting
# ==============================================================================
from plotting import plotTestVsPredicted, plotResiduals

# Modelling
# ==============================================================================
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from scripts.fit_models import XGBfit, XGBfit_GCV, LGBMfit
from RFfit import RFfit, RF_Fit_GCV, RFfit_lgbm

import optuna
import time

# Warnings configuration
# ==============================================================================
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

paths = ["plasmFlav_ord.csv", "plasmAnt_ord.csv", "urineFlav_ord.csv", "urineAnt_ord.csv"]
paths = ["../data/" + s for s in paths]
for path in paths:

    print("------- READING " + path + " -----------")
    df, df_name = fullRead(path, sep=",", full=False)

    full = True

    if not full:
        # Ensure the clinical variables are numeric
        cols = ['Weight', 'BMI', 'Fat', 'CVRI', 'Bpmin', 'Bpmax', 'Frec']
        df[cols] = df[cols].apply(pd.to_numeric)
    # Calculate the percentage of missing values in each column
    missing_percentages = df.isnull().mean()

    # Select the columns whose missing percentage exceeds the threshold
    columns_to_remove = missing_percentages[missing_percentages > 0.4].index

    print("Columns removed by NA percentage:")
    print(columns_to_remove)

    # Drop those columns from the DataFrame
    df_filtered = df.drop(columns=columns_to_remove)

    numCols = df_filtered.select_dtypes(include=np.number).drop("numVol", axis=1).columns

    # Mark out-of-range values (> 2) and the coded entries 0, 1 and 1.1 as
    # missing so that they are imputed below
    df_filtered[numCols] = df_filtered[numCols].apply(lambda x: np.where(x > 2, np.nan, x))
    df_filtered.replace([0, 1, 0.0, 1.1], np.nan, inplace=True)

    # df.dropna(inplace=True)
print ("----------- IMPUTING DATASET BY MICE ALGORITHM ------------------")
|
|
iimp = IterativeImputer(
|
|
estimator = XGBRegressor(),
|
|
random_state = 42,
|
|
verbose = 0,
|
|
)
|
|
|
|
iimp.set_output(transform="pandas")
|
|
iimp.fit(df)
|
|
iimp.transform(df[features])
|
|
df_categorical = df_filtered[['Sex', 'Sweetener', 'Time']]
|
|
df_imp = iimp.fit_transform(df_filtered.drop(['Sex', 'Sweetener', 'Time'],axis = 1))
|
|
df_filtered = pd.concat([df_imp, df_categorical], axis = 1)
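    # Optional sanity check: the imputed block should contain no remaining NaNs
    assert not df_imp.isnull().values.any(), "IterativeImputer left missing values"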
    df_filtered[['Sex', 'Sweetener']] = df_filtered[['Sex', 'Sweetener']].astype("category")

    df_filtered.to_csv("../results/df_" + df_name + "_imputed.csv", sep=";")

    # df, scaler = scaling(df)
    # df = scaling(df[df["numVol"].duplicated(keep=False)])

    numCols = df_filtered.select_dtypes(include=np.number).drop("numVol", axis=1).columns
    df_corr = df_filtered[numCols].corr(method="pearson")

    print(df_corr)
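    # df_corr is reused inside the modelling loop below to drop, for each
    # target, the other variables correlated with it above the threshold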
    for target in df_filtered.drop(["numVol", "Time", "Sweetener", "Sex"], axis=1).columns:

        print("------------ MODELLING " + target + " -----------")
        df = df_filtered

        threshold = 0.4
        # Correlations with the target variable (signed, so only positively
        # correlated features can reach the threshold)
        target_correlation = df_corr[target]
        print(target_correlation[target_correlation >= threshold])
        # Select the features highly correlated with the target (the target
        # itself always passes, so it is removed from the list first)
        correlated_features = target_correlation[target_correlation >= threshold].index.tolist()
        correlated_features.remove(target)

        print("--------------- Removed features for " + target + ": ---------------")
        print(correlated_features)
        df = df.drop(correlated_features, axis=1)

        # Predictors are the "Initial"-visit measurements; the response is the
        # same variable at the "Final" visit, with rows indexed by volunteer (numVol)
        X = df[df["Time"] == "Initial"].set_index("numVol").drop(["Time", target], axis=1)
        y = df[df["Time"] == "Final"].set_index("numVol")[target]
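        # Optional consistency check: train_test_split pairs rows positionally,
        # so the Initial and Final subsets must list the same volunteers in the
        # same order
        assert X.index.equals(y.index), "Initial/Final rows are not aligned by numVol"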
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            train_size=0.8,
            random_state=42,
            shuffle=True,
        )

        X_train, X_val, y_train, y_val = train_test_split(
            X_train,
            y_train,
            train_size=0.8,
            random_state=42,
            shuffle=True,
        )
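        # The nested 0.8 x 0.8 split leaves roughly 64 % of the samples for
        # training, 16 % for validation and 20 % for testing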
        print("Observations in train:", X_train.shape)
        print("Observations in validation:", X_val.shape)
        print("Observations in test:", X_test.shape)
        print(X.columns)
        print(y.name)

        RF = False

        # X_train, scaler = scaling(X_train)
        # y_train = scaler.transform(y_train)
        if RF:
            print(" ----------------- STARTING RF ----------------- ")
            from sklearn.ensemble import RandomForestRegressor
            from sklearn.multioutput import MultiOutputRegressor
            modelname = "rf_lgbm_OneTarget_noScaling"

            start = time.time()

            # rf_XGB = RFfit(X_train, X_test, X_val, y_train, y_test, y_val)
            rf_lgbm = RFfit_lgbm(X_train, X_test, X_val, y_train, y_test, y_val)
            # rf_XGB = RF_Fit_GCV(X_train, X_test, X_val, y_train, y_test, y_val, df_name)

            end = time.time()

            tiempo_entrenamiento_RF = end - start

            print(f"RF optimization + training time: {tiempo_entrenamiento_RF:.2f} seconds")

            # Make predictions
            # y_pred_scaled = rf_XGB.predict(X_test)
            # y_pred = scaler.inverse_transform(y_pred_scaled)
            # y_test = pd.DataFrame([i for i in scaler.inverse_transform(y_test)], index=y_test.index, columns=y_test.columns)

            # Join predictions and test values side by side and save them
            y_test.name = target + "_test"
            y_pred = pd.Series(rf_lgbm.predict(X_test), index=y_test.index, name=target + "_pred")
            df_predTest = pd.concat([y_pred, y_test], axis=1)
            df_predTest.to_csv("../results/predicts/df_pred-test_" + modelname + "_" + target + ".csv", sep=";")

            # df_predTest = pd.DataFrame([i for i in y_pred], index=y_test.index, columns=y_test.columns).add_suffix("_pred").join(y_test.add_suffix('_test'))
            # df_predTest.reindex(sorted(df_predTest.columns), axis=1).to_csv("../results/predicts/df_pred-test_" + modelname + ".csv", sep=";")

            # Plotting
            # metabs = y_test.columns.drop(list(y_test.filter(regex='Sex|Sweetener')))
            plotTestVsPredicted(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)
            plotResiduals(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)
        XGB = True

        if XGB:
            print(" ----------------- STARTING XGB ----------------- ")

            modelname = "xgb"

            start = time.time()

            xgb_XGB = XGBfit(X_train, X_test, X_val, y_train, y_test, y_val)

            end = time.time()

            tiempo_entrenamiento_XGB = end - start

            print(f"XGB optimization + training time: {tiempo_entrenamiento_XGB:.2f} seconds")

            # Save predictions
            print(" ----------------- PREDICTING AND JOINING PRED + TEST ----------------- ")

            # Make predictions
            # y_pred_scaled = xgb_XGB.predict(X_test)
            # y_pred = scaler.inverse_transform(y_pred_scaled)
            # y_test = pd.DataFrame([i for i in scaler.inverse_transform(y_test)], index=y_test.index, columns=y_test.columns)

            y_test.name = target + "_test"
            y_pred = pd.Series(xgb_XGB.predict(X_test), index=y_test.index, name=target + "_pred")
            df_predTest = pd.concat([y_pred, y_test], axis=1)
            df_predTest.to_csv("../results/predicts/df_pred-test_" + modelname + "_" + target + ".csv", sep=";")

            # df_predTest = pd.DataFrame([i for i in y_pred], index=y_test.index, columns=y_test.columns).add_suffix("_pred").join(y_test.add_suffix('_test'))
            # df_predTest.reindex(sorted(df_predTest.columns), axis=1).to_csv("../results/predicts/df_pred-test_" + modelname + ".csv", sep=";")

            # Plotting
            # metabs = y_test.columns.drop(list(y_test.filter(regex='Sex|Sweetener')))
            plotTestVsPredicted(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)
            plotResiduals(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)
        LGBM = True

        if LGBM:
            print(" ----------------- STARTING LGBM ----------------- ")

            modelname = "lgbm"

            start = time.time()

            lgbm_model = LGBMfit(X_train, X_test, X_val, y_train, y_test, y_val)

            end = time.time()

            tiempo_entrenamiento_LGBM = end - start

            print(f"LGBM optimization + training time: {tiempo_entrenamiento_LGBM:.2f} seconds")

            # Make predictions
            # y_pred_scaled = lgbm_model.predict(X_test)
            # y_pred = scaler.inverse_transform(y_pred_scaled)
            # y_test = pd.DataFrame([i for i in scaler.inverse_transform(y_test)], index=y_test.index, columns=y_test.columns)

            y_test.name = target + "_test"
            y_pred = pd.Series(lgbm_model.predict(X_test), index=y_test.index, name=target + "_pred")
            df_predTest = pd.concat([y_pred, y_test], axis=1)
            df_predTest.to_csv("../results/predicts/df_pred-test_" + modelname + "_" + target + ".csv", sep=";")

            # df_predTest = pd.DataFrame([i for i in y_pred], index=y_test.index, columns=y_test.columns).add_suffix("_pred").join(y_test.add_suffix('_test'))
            # df_predTest.reindex(sorted(df_predTest.columns), axis=1).to_csv("../results/predicts/df_pred-test_" + modelname + ".csv", sep=";")

            # Plotting
            # metabs = y_test.columns.drop(list(y_test.filter(regex='Sex|Sweetener')))
            plotTestVsPredicted(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)
            plotResiduals(metab=target, df_predTest=df_predTest, modelname=modelname, df_name=df_name)