(12 image files changed; widths, heights, and file sizes unchanged before/after, 28–46 KiB)
BIN  scripts/__pycache__/fit_models_CV.cpython-311.pyc  (new binary file)

@@ -1,13 +1,50 @@
 from xgboost import XGBRegressor, XGBRFRegressor
 from lightgbm import LGBMRegressor
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.model_selection import cross_val_score, cross_validate, RepeatedKFold
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
+from sklearn.model_selection import cross_val_score, cross_validate, RepeatedKFold, GridSearchCV
 import pandas as pd
 import pickle
 import optuna
 import time
 import numpy as np
 
 
+def LGBM_GCV(X_train, X_test, y_train, y_test):
+
+    params = {
+        'n_estimators': [100, 500, 1000],
+        'max_depth': [1, 5, 7],  # [3, 5, 7, 12],
+        'learning_rate': [0.0001, 0.1],  # [0.000001, 0.001, 0.01, 0.1],
+        'colsample_bynode': [0.01, 1],  # [0.001, 0.1, 1],
+        'subsample': [0.01, 0.1, 1],  # [0.001, 0.1, 1],
+        'boosting_type': ['gbdt', 'dart'],
+        'num_leaves': [1024, 2048]
+    }
+
+    model = LGBMRegressor(
+        n_jobs = -1,
+        random_state = 42,
+        verbose = 0
+    )
+
+    print(" ----------------- SETTING UP TRAINING ----------------- ")
+
+    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
+
+    grid_search = GridSearchCV(estimator = model, param_grid = params, cv = cv, n_jobs = -1, verbose = 0)
+
+    print(" ----------------- STARTING GRIDSEARCH ----------------- ")
+
+    grid_search.fit(X_train, y_train)
+
+    # re-score the tuned model (cross_validate's data argument is X, not x)
+    cv_scores = cross_validate(grid_search.best_estimator_,
+                               X = X_test, y = y_test, cv = cv,
+                               scoring = ("r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"),
+                               return_estimator = True,
+                               return_indices = True)
+
+    return cv_scores
+
+
 def LGBMfit_CV(X_train, X_test, y_train, y_test):
 
     # Bayesian hyperparameter search with optuna
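LGBM_GCV returns the raw dict produced by cross_validate, keyed "test_<scorer>" plus "fit_time", "score_time", "estimator", and "indices". A minimal usage sketch follows; the synthetic data and split names are assumptions, not part of this diff, and the full grid above has several hundred candidates, so a real run is slow:

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import numpy as np

# toy stand-in for the preprocessed study data
X, y = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

scores = LGBM_GCV(X_train, X_test, y_train, y_test)  # dict from cross_validate
print("mean R2:  %.3f" % np.mean(scores["test_r2"]))
print("mean MAE: %.3f" % np.mean(np.abs(scores["test_neg_mean_absolute_error"])))
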
@@ -30,7 +67,7 @@ def LGBMfit_CV(X_train, X_test, y_train, y_test):
         )
         model.fit(X_train, y_train)
         predictions = model.predict(X_test)
-        score = np.abs(r2_score(y_test, predictions))
+        score = r2_score(y_test, predictions)
         return score
 
     study = optuna.create_study(direction='maximize')
@@ -49,7 +86,7 @@ def LGBMfit_CV(X_train, X_test, y_train, y_test):
     )
 
     # Model training & scores
-    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
+    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
     #lightGMB_model.fit(X_train, y_train)
     cv_scores = cross_validate(lightGMB_model,
                                pd.concat([X_train, X_test]), pd.concat([y_train, y_test]), cv = cv,
@@ -63,33 +100,7 @@ def LGBMfit_CV(X_train, X_test, y_train, y_test):
 
     return cv_scores
 
-### test area
-from utils import preproc_2
-import os
-main_df = pd.read_csv(os.path.join("..", "data", "final_merge.csv"), sep =";", decimal= ".")
-main_df[['Sex', 'Sweetener']] = main_df[['Sex', 'Sweetener']].astype("category")
-
-study_features = ["Sex", "Sweetener"]
-targets = ['HE_post']
-features = ['HE.G_plasm_pre', 'HE_pre', 'HE.G_urine_pre', 'HE.GG_pre', 'Total.HE_pre'] + study_features
-
-X_train, X_test, y_train, y_test = preproc_2(df=main_df, targets=targets, features=features)
-
-cv_scores_prueba = LGBMfit_CV(X_train, X_test, y_train, y_test)
-
-r2_scores = cv_scores_prueba["test_r2"]
-MAE_scores = np.abs(cv_scores_prueba["test_neg_mean_absolute_error"])
-RMSE_scores = np.abs(cv_scores_prueba["test_neg_root_mean_squared_error"])
-
-print(r2_scores)
-print('R2 scores: %.3f (%.3f)' % (np.mean(r2_scores), np.std(r2_scores)))
-print('MAE scores: %.3f (%.3f)' % (np.mean(MAE_scores), np.std(MAE_scores)))
-print('RMSE scores: %.3f (%.3f)' % (np.mean(RMSE_scores), np.std(RMSE_scores)))
-
-###
 
 
-def XGBfit(X_train, X_test, X_val, y_train, y_test, y_val):
+def XGBfit_CV_score(X_train, X_test, X_val, y_train, y_test, y_val):
     # Bayesian hyperparameter search with optuna
     # ==============================================================================
     def objective(trial):
@@ -115,8 +126,8 @@ def XGBfit(X_train, X_test, X_val, y_train, y_test, y_val):
             **params
         )
         model.fit(X_train, y_train)
-        predictions = model.predict(X_val)
-        score = mean_squared_error(y_val, predictions, squared=False)
+        predictions = model.predict(X_test)
+        score = mean_squared_error(y_test, predictions, squared=False)
         return score
 
     study = optuna.create_study(direction='minimize')
@@ -138,31 +149,81 @@ def XGBfit(X_train, X_test, X_val, y_train, y_test, y_val):
         **study.best_params
     )
 
-    # Model training
-    start = time.time()
-    xgb_xgb.fit(X_train, y_train)
-    end = time.time()
-    tiempo_entrenamiento_xgb_xgb = end - start
-
-    # Test-set predictions
-    start = time.time()
-    predicciones = xgb_xgb.predict(X=X_test)
-    end = time.time()
-    tiempo_prediccion_xgb_xgb = end - start
+    # Model training & scores
+    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
+    #lightGMB_model.fit(X_train, y_train)
+    cv_scores = cross_validate(xgb_xgb,
+                               pd.concat([X_train, X_test]), pd.concat([y_train, y_test]), cv = cv,
+                               scoring = ("r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"),
+                               return_estimator = True,
+                               return_indices = True)
+    return cv_scores
 
-    # Test error of the model
-    rmse_xgb_xgb = mean_squared_error(
-        y_true = y_test,
-        y_pred = predicciones,
-        squared = False
-    )
-
-    print(f"Training time: {tiempo_entrenamiento_xgb_xgb:.2f} seconds")
-    print(f"Prediction time: {tiempo_prediccion_xgb_xgb:.2f} seconds")
-    print(f"RMSE: {rmse_xgb_xgb:.2f}")
-    return xgb_xgb
 
 
+def XGBfit_CV_score(X_train, X_test, X_val, y_train, y_test, y_val):
+    # Bayesian hyperparameter search with optuna
+    # ==============================================================================
+    def objective(trial):
+        params = {
+            'n_estimators': trial.suggest_int('n_estimators', 10, 1000, step=10),
+            'max_depth': trial.suggest_int('max_depth', 3, 12),
+            'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 5),
+            'learning_rate': trial.suggest_float('learning_rate', 0.00001, 0.01),
+            'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 0.1, log=True),
+            'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 0.1, log=True),
+            'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),
+            'subsample': trial.suggest_float('subsample', 0.1, 1),
+        }
+
+        model = XGBRegressor(
+            tree_method = 'hist',
+            eval_metric = 'rmse',
+            n_jobs = -1,
+            random_state = 42,
+            verbosity = 0,  # the constructor argument is verbosity, not verbose
+            enable_categorical = True,
+            multi_strategy = "multi_output_tree",
+            device = "cuda",
+            **params
+        )
+        model.fit(X_train, y_train)
+        predictions = model.predict(X_test)
+        score = mean_squared_error(y_test, predictions, squared=False)
+        return score
+
+    study = optuna.create_study(direction='minimize')
+    study.optimize(objective, n_trials=100, show_progress_bar=True, timeout=100*10)
+
+    print('Best hyperparameters:', study.best_params)
+    print('Best score:', study.best_value)
+
+    # XGBoost with the best hyperparameters found
+    # ==============================================================================
+    xgb_xgb = XGBRegressor(
+        tree_method = 'hist',
+        eval_metric = 'rmse',
+        n_jobs = -1,
+        random_state = 42,
+        verbosity = 0,
+        enable_categorical = True,
+        multi_strategy = "multi_output_tree",
+        **study.best_params
+    )
+
+    # Model training & scores
+    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
+    #lightGMB_model.fit(X_train, y_train)
+    cv_scores = cross_validate(xgb_xgb,
+                               pd.concat([X_train, X_test]), pd.concat([y_train, y_test]), cv = cv,
+                               scoring = ("r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"),
+                               return_estimator = True,
+                               return_indices = True)
+    return cv_scores
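Both XGB helpers return the same cross_validate dict, with device = "cuda" hard-coded; that assumes an XGBoost >= 2.0 build with GPU support (multi_strategy = "multi_output_tree" likewise needs XGBoost >= 2.0 and tree_method = "hist"), so on a CPU-only machine device = "cpu" is the safe choice. A small sketch of a summary helper, assumed rather than part of this diff, that collapses the dict into one row per metric while un-negating sklearn's neg_* scorers:

import numpy as np
import pandas as pd

def summarize_cv(cv_scores):
    rows = {}
    for key, values in cv_scores.items():
        if not key.startswith("test_"):
            continue  # skip fit_time, score_time, estimator, indices, ...
        vals = np.asarray(values)
        name = key[len("test_"):]
        if name.startswith("neg_"):  # e.g. neg_mean_absolute_error -> mean_absolute_error
            vals, name = -vals, name[len("neg_"):]
        rows[name] = {"mean": vals.mean(), "std": vals.std()}
    return pd.DataFrame(rows).T

# e.g. print(summarize_cv(XGBfit_CV_score(X_train, X_test, X_val, y_train, y_test, y_val)))
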
 
 
 def RFfit(X_train, X_test, X_val, y_train, y_test, y_val):
     '''
     function to train a Random Forest


@@ -1,5 +1,7 @@
 from modelSanoApp import exec_models, preproc
-from fit_models import RFfit_lgbm
+from fit_models_CV import XGBfit_CV_score
 from utils import preproc_2
 import pandas as pd
+import pickle       # used by testing_funct below; was missing from the imports
+import numpy as np  # used by the score summaries below; was missing from the imports
 import os
 import time
@@ -7,45 +7,52 @@ import time
 main_df = pd.read_csv(os.path.join("..", "data", "final_merge.csv"), sep =";", decimal= ".")
 main_df[['Sex', 'Sweetener']] = main_df[['Sex', 'Sweetener']].astype("category")
 
 # define cases
 
+def testing_funct(df, dir_name, targets, features, model_func):
+
+    print(" ----------------- STARTING " + dir_name + " ----------------- ")
+
+    directory = os.path.join("../results/test_cases", dir_name)
+    if not os.path.exists(directory):
+        os.mkdir(directory)
+
+    X_train, X_test, y_train, y_test = preproc_2(df=df, targets=targets, features=features)
+
+    cv_scores = model_func(X_train, X_test, y_train, y_test)
+
+    with open(os.path.join(directory, "prueba_" + dir_name + "_modelo.pkl"), "wb") as f:
+        pickle.dump(cv_scores, f, protocol = 5)
+
+    # summarize the cross-validation scores (the original referenced the
+    # undefined names study and cv_scores_prueba here)
+    r2_scores = cv_scores["test_r2"]
+    MAE_scores = np.abs(cv_scores["test_neg_mean_absolute_error"])
+    RMSE_scores = np.abs(cv_scores["test_neg_root_mean_squared_error"])
+
+    print(r2_scores)
+    print('R2 scores: %.3f (%.3f)' % (np.mean(r2_scores), np.std(r2_scores)))
+    print('MAE scores: %.3f (%.3f)' % (np.mean(MAE_scores), np.std(MAE_scores)))
+    print('RMSE scores: %.3f (%.3f)' % (np.mean(RMSE_scores), np.std(RMSE_scores)))
 
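testing_funct persists the whole cross_validate dict with pickle, so a finished case can be re-inspected without re-running the Optuna search. A minimal reload sketch; the VA-GG path below simply follows the naming scheme used above:

import os
import pickle
import numpy as np

with open(os.path.join("../results/test_cases", "VA-GG_case", "prueba_VA-GG_case_modelo.pkl"), "rb") as f:
    cv_scores = pickle.load(f)

r2 = cv_scores["test_r2"]
print('R2 scores: %.3f (%.3f)' % (np.mean(r2), np.std(r2)))
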
 
 study_features = ["Sex", "Sweetener"]
 
-directory = "../results/test_cases"
+# VA-GG case
 
+targets = ['VA.GG_plasm_post']
+features = ['VA_plasm_pre','VA.GG_plasm_pre', 'VA.S_pre', 'VA.GS_plasm_pre', 'VA.SS_plasm_pre', 'Total.VA_plasm_pre'] + study_features
+
+testing_funct(df = main_df, dir_name = "VA-GG_case", targets=targets, features=features, model_func= XGBfit_CV_score)
 
-print(" ----------------- STARTING " + directory + " ----------------- ")
-VA_GG = False
-if VA_GG:
-    if not os.path.exists(directory):
-        os.mkdir(directory)
-    targets = ['VA.GG_plasm_post']
-    features = ['VA_plasm_pre','VA.GG_plasm_pre', 'VA.S_pre', 'VA.GS_plasm_pre', 'VA.SS_plasm_pre', 'Total.VA_plasm_pre'] + study_features
-
-    model_list, X_test, y_test = exec_models(main_df, features=features, targets=targets, multiple=False, directory=directory+"/")
-
-    X_test_sweetShifted = X_test.replace(["SA", "ST", "SU"], ["SU", "SA", "ST"])
-
-    pred_xgb = pd.Series(model_list[0].predict(X_test_sweetShifted), index = y_test.index, name = "xgb_pred")
-    pred_rf = pd.Series(model_list[1].predict(X_test_sweetShifted), index = y_test.index, name = "rf_pred")
-    pred_lgbm = pd.Series(model_list[2].predict(X_test_sweetShifted), index = y_test.index, name = "lgbm_pred")
-
-    pd.concat([y_test, pred_xgb, pred_rf, pred_lgbm, X_test["Sweetener"], X_test_sweetShifted["Sweetener"]], axis = 1).to_csv(directory+"/prueba_predSweet_VA-GG.csv", sep = ";")
 
+# N-G case
 
+target = ['N.G_urine_post']
+features = ['N.G_plasm_pre', 'N_pre','N.G_urine_pre', 'N.GG_pre', 'N.S_pre', 'Total.N_pre'] + study_features
+
+# note: pass target here, not the stale VA-GG targets list the original reused
+testing_funct(df = main_df, dir_name = "N-G_case", targets=target, features=features, model_func= XGBfit_CV_score)
 
-NG = True
-
-if NG:
-    if not os.path.exists(directory):
-        os.mkdir(directory)
-    target = ['N.G_urine_post']
-    features = ['N.G_plasm_pre', 'N_pre','N.G_urine_pre', 'N.GG_pre', 'N.S_pre', 'Total.N_pre'] + study_features
-
-    X_train, X_test, X_val, y_train, y_test, y_val = preproc(df=main_df, targets=target, features=features)
-
-    rf_model = RFfit_lgbm(X_train, X_test, X_val, y_train, y_test, y_val)
-
-    X_test_sexShifted = X_test.replace(["MAN", "WOMAN"], ["WOMAN", "MAN"])
-
-    #pred_xgb = pd.Series(model_list[0].predict(X_test_sexShifted), index = y_test.index, name = "xgb_pred")
-    pred_rf = pd.Series(rf_model.predict(X_test_sexShifted), index = y_test.index, name = "rf_pred")
-    #pred_lgbm = pd.Series(model_list[2].predict(X_test_sexShifted), index = y_test.index, name = "lgbm_pred")
-
-    pd.concat([y_test, pred_rf, X_test["Sex"], X_test_sexShifted["Sex"]], axis = 1).to_csv(directory+"/prueba_predSex_NG.csv", sep = ";")
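The removed blocks above probed counterfactual predictions by swapping the levels of one categorical column (Sweetener or Sex) in X_test before predicting. A generic sketch of that check; the helper name and mapping argument are assumptions, not part of this diff:

import pandas as pd

def shifted_predictions(model, X_test, column, mapping):
    # predict on a copy of X_test with one categorical column's levels permuted,
    # to see how strongly the model's output depends on that column
    X_shift = X_test.copy()
    X_shift[column] = X_shift[column].map(mapping).astype("category")
    return pd.Series(model.predict(X_shift), index=X_test.index, name=column + "_shifted_pred")

# e.g. shifted_predictions(rf_model, X_test, "Sex", {"MAN": "WOMAN", "WOMAN": "MAN"})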