blablabla

This commit is contained in:
kubrik 2024-07-17 15:23:06 +02:00
parent ea88fb26af
commit 84203724ce
31 changed files with 157 additions and 89 deletions

Binary file not shown.

View file

@ -1,13 +1,50 @@
from xgboost import XGBRegressor, XGBRFRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, cross_validate, RepeatedKFold, GridSearchCV
import pandas as pd
import pickle
import optuna
import time
import numpy as np
def LGBM_GCV(X_train, X_test, y_train, y_test):
    """Grid-search a LightGBM regressor, then cross-validate the best estimator.

    The hyperparameter grid is searched with ``GridSearchCV`` on the training
    split. The winning estimator is then scored with repeated K-fold
    cross-validation on the test split.

    Args:
        X_train: Training features, passed to ``GridSearchCV.fit``.
        X_test: Features used for the final cross-validation.
        y_train: Training targets.
        y_test: Targets used for the final cross-validation.

    Returns:
        The dict produced by ``sklearn.model_selection.cross_validate`` with
        R2, negated MAE and negated RMSE test scores, plus the fitted
        estimators and the fold indices.
    """
    params = {
        'n_estimators': [100, 500, 1000],
        'max_depth': [1, 5, 7],  # alternative: [3, 5, 7, 12]
        'learning_rate': [0.0001, 0.1],  # alternative: [0.000001, 0.001, 0.01, 0.1]
        'colsample_bynode': [0.01, 1],  # alternative: [0.001, 0.1, 1]
        # NOTE(review): LightGBM appears to apply `subsample` only when
        # `subsample_freq` > 0 -- confirm this axis has any effect.
        'subsample': [0.01, 0.1, 1],  # alternative: [0.001, 0.1, 1]
        'boosting_type': ['gbdt', 'dart'],
        'num_leaves': [1024, 2048],
    }

    model = LGBMRegressor(
        n_jobs=-1,
        random_state=42,
        verbose=0,
    )

    print(" ----------------- SETTING UP TRAINING ----------------- ")
    # The same splitter (10 folds x 3 repeats, fixed seed) drives both the
    # grid search and the final scoring.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=cv,
        n_jobs=-1,
        verbose=0,
    )

    print(" ----------------- STARTING GRIDSEARCH ----------------- ")
    grid_search.fit(X_train, y_train)

    # `cross_validate` names its data argument `X` (upper case); the previous
    # lower-case `x=` keyword raised TypeError before any scoring happened.
    cv_scores = cross_validate(
        grid_search.best_estimator_,
        X=X_test,
        y=y_test,
        cv=cv,
        scoring=("r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"),
        return_estimator=True,
        return_indices=True,
    )
    return cv_scores
def LGBMfit_CV(X_train, X_test, y_train, y_test):
# Búsqueda bayesiana de hiperparámetros con optuna
@ -30,7 +67,7 @@ def LGBMfit_CV(X_train, X_test, y_train, y_test):
)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
score = np.abs(r2_score(y_test, predictions))
score = r2_score(y_test, predictions)
return score
study = optuna.create_study(direction='maximize')
@ -49,7 +86,7 @@ def LGBMfit_CV(X_train, X_test, y_train, y_test):
)
# Entrenamiento & scores modelo
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
#lightGMB_model.fit(X_train, y_train)
cv_scores = cross_validate(lightGMB_model,
pd.concat([X_train, X_test]), pd.concat([y_train, y_test]), cv = cv,
@ -63,33 +100,7 @@ def LGBMfit_CV(X_train, X_test, y_train, y_test):
return cv_scores
### test area
# Ad-hoc smoke test: load the merged dataset, run LGBMfit_CV on the HE case
# and print the cross-validated metrics.
from utils import preproc_2
import os
# The dataset is a ';'-separated CSV located at ../data/final_merge.csv.
main_df = pd.read_csv(os.path.join("..", "data", "final_merge.csv"), sep =";", decimal= ".")
# Cast the two study-design columns to pandas categoricals.
main_df[['Sex', 'Sweetener']] = main_df[['Sex', 'Sweetener']].astype("category")
study_features = ["Sex", "Sweetener"]
# Single target column; the feature list is the "pre" columns plus the study features.
targets = ['HE_post']
features = ['HE.G_plasm_pre', 'HE_pre', 'HE.G_urine_pre', 'HE.GG_pre', 'Total.HE_pre'] + study_features
X_train, X_test, y_train, y_test= preproc_2(df=main_df, targets=targets, features=features)
cv_scores_prueba = LGBMfit_CV(X_train, X_test, y_train, y_test)
# The error metrics are stored under "neg_" keys, so np.abs turns them into magnitudes.
r2_scores = cv_scores_prueba["test_r2"]
MAE_scores = np.abs(cv_scores_prueba["test_neg_mean_absolute_error"])
RMSE_scores = np.abs(cv_scores_prueba["test_neg_root_mean_squared_error"])
print(r2_scores)
# Each summary line is formatted as "mean (standard deviation)" over all folds.
print('R2 scores: %.3f (%.3f)' % (np.mean(r2_scores), np.std(r2_scores)))
print('MAE scores: %.3f (%.3f)' % (np.mean(MAE_scores), np.std(MAE_scores)))
print('RMSE scores: %.3f (%.3f)' % (np.mean(RMSE_scores), np.std(RMSE_scores)))
###
def XGBfit(X_train, X_test, X_val, y_train, y_test, y_val):
def XGBfit_CV_score(X_train, X_test, X_val, y_train, y_test, y_val):
# Búsqueda bayesiana de hiperparámetros con optuna
# ==============================================================================
def objective(trial):
@ -115,8 +126,8 @@ def XGBfit(X_train, X_test, X_val, y_train, y_test, y_val):
**params
)
model.fit(X_train, y_train)
predictions = model.predict(X_val)
score = mean_squared_error(y_val, predictions, squared=False)
predictions = model.predict(X_test)
score = mean_squared_error(y_test, predictions, squared=False)
return score
study = optuna.create_study(direction='minimize')
@ -138,31 +149,81 @@ def XGBfit(X_train, X_test, X_val, y_train, y_test, y_val):
**study.best_params
)
# Entrenamiento del modelo
start = time.time()
xgb_xgb.fit(X_train, y_train)
end = time.time()
tiempo_entrenamiento_xgb_xgb = end - start
# Predicciones test
start = time.time()
predicciones = xgb_xgb.predict(X=X_test)
end = time.time()
tiempo_prediccion_xgb_xgb = end - start
# Entrenamiento & scores modelo
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
#lightGMB_model.fit(X_train, y_train)
cv_scores = cross_validate(xgb_xgb,
pd.concat([X_train, X_test]), pd.concat([y_train, y_test]), cv = cv,
scoring = ("r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"),
return_estimator = True,
return_indices = True)
return (cv_scores)
# Error de test del modelo
rmse_xgb_xgb = mean_squared_error(
y_true = y_test,
y_pred = predicciones,
squared = False
)
print(f"Tiempo entrenamiento: {tiempo_entrenamiento_xgb_xgb:.2f} segundos")
print(f"Tiempo predicción: {tiempo_prediccion_xgb_xgb:.2f} segundos")
print(f"RMSE: {rmse_xgb_xgb:.2f}")
return xgb_xgb
def XGBfit_CV_score(X_train, X_test, X_val, y_train, y_test, y_val):
    """Tune an XGBoost regressor with Optuna, then cross-validate the best setup.

    Each Optuna trial fits a model on the training split and is scored by its
    RMSE on the test split. The best hyperparameters are then evaluated with
    repeated K-fold cross-validation on the concatenation of the training and
    test splits.

    Args:
        X_train: Training features (pandas object; concatenated with X_test).
        X_test: Features used to score every trial.
        X_val: Unused; kept so the signature stays compatible with callers.
        y_train: Training targets.
        y_test: Targets used to score every trial.
        y_val: Unused; kept so the signature stays compatible with callers.

    Returns:
        The dict produced by ``sklearn.model_selection.cross_validate`` with
        R2, negated MAE and negated RMSE test scores, plus the fitted
        estimators and the fold indices.
    """
    # Constructor settings shared by every trial model and by the final model.
    # `verbosity` is XGBoost's logging switch; the former `verbose=0` keyword
    # is not an XGBRegressor parameter.
    fixed_params = {
        'tree_method': 'hist',
        'eval_metric': 'rmse',
        'n_jobs': -1,
        'random_state': 42,
        'verbosity': 0,
        'enable_categorical': True,
        'multi_strategy': "multi_output_tree",
    }

    # Bayesian hyperparameter search with Optuna
    # ==========================================================================
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 1000, step=10),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 5),
            'learning_rate': trial.suggest_float('learning_rate', 0.00001, 0.01),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 0.1, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 0.1, log=True),
            'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),
            'subsample': trial.suggest_float('subsample', 0.1, 1),
        }
        # Only the search runs on the GPU; the final model below does not set
        # `device`, exactly as before.
        model = XGBRegressor(device="cuda", **fixed_params, **params)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        # RMSE per output, averaged: same value `squared=False` produced,
        # without relying on that deprecated keyword.
        per_output_mse = mean_squared_error(
            y_test, predictions, multioutput="raw_values"
        )
        return float(np.mean(np.sqrt(per_output_mse)))

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100, show_progress_bar=True, timeout=100*10)

    print('Mejores hiperparámetros:', study.best_params)
    print('Mejor score:', study.best_value)

    # XGBoost with the best hyperparameters found
    # ==========================================================================
    xgb_xgb = XGBRegressor(**fixed_params, **study.best_params)

    # Model training & scores
    # NOTE(review): the test split was already used to pick the
    # hyperparameters, so these CV scores may be optimistic -- confirm this
    # is intended.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
    cv_scores = cross_validate(
        xgb_xgb,
        pd.concat([X_train, X_test]),
        pd.concat([y_train, y_test]),
        cv=cv,
        scoring=("r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"),
        return_estimator=True,
        return_indices=True,
    )
    return cv_scores
def RFfit(X_train, X_test, X_val, y_train, y_test, y_val):
'''
function to train a Random Forest

Binary file not shown.

View file

@ -1,5 +1,5 @@
from modelSanoApp import exec_models, preproc
from fit_models import RFfit_lgbm
from fit_models_CV import XGBfit_CV_score
from utils import preproc_2
import pandas as pd
import os
import time
@ -7,45 +7,52 @@ import time
main_df = pd.read_csv(os.path.join("..", "data", "final_merge.csv"), sep =";", decimal= ".")
main_df[['Sex', 'Sweetener']] = main_df[['Sex', 'Sweetener']].astype("category")
# define cases
def testing_funct(df, dir_name, targets, features, model_func):
    """Run one test case: preprocess, cross-validate, pickle and print scores.

    Args:
        df: Input data, forwarded to ``preproc_2``.
        dir_name: Case name; used as the output sub-directory under
            ``../results/test_cases`` and inside the pickle file name.
        targets: Target column names, forwarded to ``preproc_2``.
        features: Feature column names, forwarded to ``preproc_2``.
        model_func: Callable invoked as
            ``model_func(X_train, X_test, y_train, y_test)``. Its result must
            be picklable and expose the keys ``"test_r2"``,
            ``"test_neg_mean_absolute_error"`` and
            ``"test_neg_root_mean_squared_error"``.

    Returns:
        None. The scores are written to disk and printed.
    """
    # Imported locally because the import lines visible at the top of this
    # script do not bring in pickle or numpy.
    import pickle

    import numpy as np

    print(" ----------------- STARTING "+ dir_name + "----------------- ")

    directory = os.path.join("../results/test_cases", dir_name)
    # makedirs also creates missing parent directories and tolerates an
    # already existing target, unlike the former exists()/mkdir() pair.
    os.makedirs(directory, exist_ok=True)

    X_train, X_test, y_train, y_test = preproc_2(df=df, targets=targets, features=features)
    cv_scores = model_func(X_train, X_test, y_train, y_test)

    with open(os.path.join(directory,"prueba_"+dir_name+"_modelo.pkl"), "wb") as f:
        pickle.dump(cv_scores, f, protocol = 5)

    # The scores are read from `cv_scores`, the local result of model_func.
    # The former `study.best_value` print and the `cv_scores_prueba` lookups
    # referenced names that do not exist in this scope.
    r2_scores = cv_scores["test_r2"]
    MAE_scores = np.abs(cv_scores["test_neg_mean_absolute_error"])
    RMSE_scores = np.abs(cv_scores["test_neg_root_mean_squared_error"])

    print(r2_scores)
    print('R2 scores: %.3f (%.3f)' % (np.mean(r2_scores), np.std(r2_scores)))
    print('MAE scores: %.3f (%.3f)' % (np.mean(MAE_scores), np.std(MAE_scores)))
    print('RMSE scores: %.3f (%.3f)' % (np.mean(RMSE_scores), np.std(RMSE_scores)))
study_features = ["Sex", "Sweetener"]
directory = "../results/test_cases"
# VA-GG case
targets = ['VA.GG_plasm_post']
features = ['VA_plasm_pre','VA.GG_plasm_pre', 'VA.S_pre', 'VA.GS_plasm_pre', 'VA.SS_plasm_pre', 'Total.VA_plasm_pre'] + study_features
testing_funct (df = main_df, dir_name = "VA-GG_case", targets=targets, features=features, model_func= XGBfit_CV_score)
print(" ----------------- STARTING "+ directory + "----------------- ")
VA_GG = False
if VA_GG:
if not os.path.exists(directory):
os.mkdir(directory)
targets = ['VA.GG_plasm_post']
features = ['VA_plasm_pre','VA.GG_plasm_pre', 'VA.S_pre', 'VA.GS_plasm_pre', 'VA.SS_plasm_pre', 'Total.VA_plasm_pre'] + study_features
# N-G case
model_list, X_test, y_test = exec_models(main_df, features=features, targets=targets, multiple=False, directory=directory+"/")
target = ['N.G_urine_post']
features = ['N.G_plasm_pre', 'N_pre','N.G_urine_pre', 'N.GG_pre', 'N.S_pre', 'Total.N_pre'] + study_features
X_test_sweetShifted = X_test.replace(["SA", "ST", "SU"], ["SU", "SA", "ST"])
pred_xgb = pd.Series(model_list[0].predict(X_test_sweetShifted), index = y_test.index, name = "xgb_pred")
pred_rf = pd.Series(model_list[1].predict(X_test_sweetShifted), index = y_test.index, name = "rf_pred")
pred_lgbm = pd.Series(model_list[2].predict(X_test_sweetShifted), index = y_test.index, name = "lgbm_pred")
pd.concat([y_test, pred_xgb, pred_rf, pred_lgbm, X_test["Sweetener"], X_test_sweetShifted["Sweetener"]], axis = 1).to_csv(directory+"/prueba_predSweet_VA-GG.csv", sep = ";")
testing_funct (df = main_df, dir_name = "N-G_case", targets=targets, features=features, model_func= XGBfit_CV_score)
NG = True
if NG:
if not os.path.exists(directory):
os.mkdir(directory)
target = ['N.G_urine_post']
features = ['N.G_plasm_pre', 'N_pre','N.G_urine_pre', 'N.GG_pre', 'N.S_pre', 'Total.N_pre'] + study_features
X_train, X_test, X_val, y_train, y_test, y_val = preproc(df=main_df, targets=target, features=features)
rf_model = RFfit_lgbm(X_train, X_test, X_val, y_train, y_test, y_val)
X_test_sexShifted = X_test.replace(["MAN", "WOMAN"], ["WOMAN", "MAN"]).
#pred_xgb = pd.Series(model_list[0].predict(X_test_sexShifted), index = y_test.index, name = "xgb_pred")
pred_rf = pd.Series(rf_model.predict(X_test_sexShifted), index = y_test.index, name = "rf_pred")
#pred_lgbm = pd.Series(model_list[2].predict(X_test_sexShifted), index = y_test.index, name = "lgbm_pred")
pd.concat([y_test, pred_rf, X_test["Sex"], X_test_sexShifted["Sex"]] , axis = 1).to_csv(directory+"/prueba_predSex_NG.csv", sep = ";")