# modelSanoApp/scripts/utils.py

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- activates IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

def preproc_2(df, features, targets):
    '''
    Impute missing numeric values, then split the data into train/test sets.
    '''
    # Normalise `targets` to a list without mutating the caller's `features` list
    if isinstance(targets, str):
        featplustargets = features + [targets]
    else:
        featplustargets = features + targets
    # Numeric columns get imputed; all remaining columns are passed through untouched
    num_cols = df[featplustargets].select_dtypes(include=np.number).columns
    to_imput = df[num_cols]
    not_imput = df.columns.drop(num_cols)
    iimp = IterativeImputer(
        estimator=XGBRegressor(),
        random_state=42,
        verbose=0,
    )
    iimp.set_output(transform="pandas")
    df_imp = iimp.fit_transform(to_imput)
    df_imp[not_imput] = df[not_imput]
    X_train, X_test, y_train, y_test = train_test_split(
        df_imp[features],  # use the imputed features (the original passed the raw df[features])
        df_imp[targets],
        train_size=0.8,
        random_state=42,
        shuffle=True,
    )
    print("Observations in train:", X_train.shape)
    print("Observations in test:", X_test.shape)
    return X_train, X_test, y_train, y_test
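
# Illustrative call (hypothetical column names, invented for the example):
#   X_train, X_test, y_train, y_test = preproc_2(df, ["edad", "peso"], "imc")
# Passing a single string as `targets` yields Series for y_train/y_test;
# passing a list of target names yields DataFrames.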
def scaling(df_read):
    '''
    Power-transform every numeric column except "numVol", in place.
    '''
    scaler = preprocessing.PowerTransformer()
    # scaler = preprocessing.MinMaxScaler()
    # scaler = preprocessing.RobustScaler()
    # "numVol" appears to be an identifier column, so it is left unscaled
    numCols = df_read.select_dtypes(include=np.number).drop("numVol", axis=1).columns
    df_read[numCols] = scaler.fit_transform(df_read[numCols])
    return (df_read, scaler)
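
# Illustrative call (a sketch; assumes the frame has the "numVol" identifier
# column that scaling() skips). scaling() mutates its argument, so pass a copy:
#   df_scaled, scaler = scaling(df.copy())
# The fitted scaler can then transform new data with the same columns:
#   new_df[num_cols] = scaler.transform(new_df[num_cols])   # num_cols: hypothetical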
def preproc(df, features, targets):
    '''
    Impute missing numeric values, then split the data into train/validation/test sets.
    '''
    # Normalise `targets` to a list without mutating the caller's `features` list
    if isinstance(targets, str):
        featplustargets = features + [targets]
    else:
        featplustargets = features + targets
    # Numeric columns get imputed; all remaining columns are passed through untouched
    num_cols = df[featplustargets].select_dtypes(include=np.number).columns
    to_imput = df[num_cols]
    not_imput = df.columns.drop(num_cols)
    iimp = IterativeImputer(
        estimator=XGBRegressor(),
        random_state=42,
        verbose=0,
    )
    iimp.set_output(transform="pandas")
    df_imp = iimp.fit_transform(to_imput)
    df_imp[not_imput] = df[not_imput]
    # First split off 20% for test, then 20% of the remainder for validation
    X_train, X_test, y_train, y_test = train_test_split(
        df_imp[features],  # use the imputed features (the original passed the raw df[features])
        df_imp[targets],
        train_size=0.8,
        random_state=42,
        shuffle=True,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        train_size=0.8,
        random_state=42,
        shuffle=True,
    )
    print("Observations in train:", X_train.shape)
    print("Observations in validation:", X_val.shape)
    print("Observations in test:", X_test.shape)
    return X_train, X_test, X_val, y_train, y_test, y_val
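
# A minimal smoke test: an illustrative sketch, with all column names invented
# for the demo ("numVol" mirrors the identifier column that scaling() skips).
# It scales a synthetic frame, knocks out some values, then imputes and splits.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    demo = pd.DataFrame({
        "numVol": np.arange(100),        # identifier column, left unscaled
        "a": rng.normal(size=100),
        "b": rng.normal(size=100),
        "y": rng.normal(size=100),
    })
    demo_scaled, fitted_scaler = scaling(demo.copy())   # copy: scaling() mutates
    # Introduce missing values so the imputer has work to do
    demo_scaled.loc[demo_scaled.sample(10, random_state=42).index, "a"] = np.nan
    X_train, X_test, X_val, y_train, y_test, y_val = preproc(
        demo_scaled, features=["a", "b"], targets="y"
    )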