import pandas as pd
import numpy as np
# Imported only for its side effect: it enables the experimental
# IterativeImputer in scikit-learn.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


def preproc_2(df, features, targets):
    '''
    Impute missing numeric values and split the data into train/test sets.
    '''
    # Combine features and targets without mutating the caller's `features` list.
    if isinstance(targets, str):
        featplustargets = features + [targets]
    else:
        featplustargets = features + targets

    # Numeric columns get imputed; the remaining columns are copied back
    # into the imputed frame afterwards.
    num_cols = df[featplustargets].select_dtypes(include=np.number).columns
    to_imput = df[num_cols]
    not_imput = df.columns.drop(num_cols)

    iimp = IterativeImputer(
        estimator=XGBRegressor(),
        random_state=42,
        verbose=0,
    )
    iimp.set_output(transform="pandas")

    df_imp = iimp.fit_transform(to_imput)
    df_imp[not_imput] = df[not_imput]

    # Split the imputed frame so the filled-in values reach the model.
    X_train, X_test, y_train, y_test = train_test_split(
        df_imp[features],
        df_imp[targets],
        train_size=0.8,
        random_state=42,
        shuffle=True,
    )

    print("Observations in train:", X_train.shape)
    print("Observations in test:", X_test.shape)

    return X_train, X_test, y_train, y_test
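
# Example usage (a minimal sketch; the file name and column names below are
# hypothetical, not defined in this module):
#
#     df = pd.read_csv("housing.csv")
#     X_train, X_test, y_train, y_test = preproc_2(
#         df,
#         features=["rooms", "area"],
#         targets="price",
#     )
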
def scaling(df_read):
    '''
    Power-transform all numeric columns except "numVol"; return the scaled
    frame together with the fitted scaler.
    '''
    scaler = preprocessing.PowerTransformer()
    # Alternative scalers:
    # scaler = preprocessing.MinMaxScaler()
    # scaler = preprocessing.RobustScaler()

    # "numVol" is excluded from scaling.
    numCols = df_read.select_dtypes(include=np.number).drop("numVol", axis=1).columns
    df_read[numCols] = scaler.fit_transform(df_read[numCols])

    return df_read, scaler
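
# Example usage (a sketch; assumes `df` is a DataFrame with a numeric
# "numVol" column, which scaling() leaves untouched):
#
#     df_scaled, fitted_scaler = scaling(df)
#     # The fitted scaler can be reused later, e.g. to undo the transform
#     # on the same (hypothetical) numeric columns:
#     # df_scaled[num_cols] = fitted_scaler.inverse_transform(df_scaled[num_cols])
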
def preproc(df, features, targets):
    '''
    Impute missing numeric values and split the data into
    train/validation/test sets.
    '''
    # Combine features and targets without mutating the caller's `features` list.
    if isinstance(targets, str):
        featplustargets = features + [targets]
    else:
        featplustargets = features + targets

    # Numeric columns get imputed; the remaining columns are copied back
    # into the imputed frame afterwards.
    num_cols = df[featplustargets].select_dtypes(include=np.number).columns
    to_imput = df[num_cols]
    not_imput = df.columns.drop(num_cols)

    iimp = IterativeImputer(
        estimator=XGBRegressor(),
        random_state=42,
        verbose=0,
    )
    iimp.set_output(transform="pandas")

    df_imp = iimp.fit_transform(to_imput)
    df_imp[not_imput] = df[not_imput]

    # First split: 80% train+validation, 20% test.
    X_train, X_test, y_train, y_test = train_test_split(
        df_imp[features],
        df_imp[targets],
        train_size=0.8,
        random_state=42,
        shuffle=True,
    )

    # Second split: 80% of the remainder for train, 20% for validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        train_size=0.8,
        random_state=42,
        shuffle=True,
    )

    print("Observations in train:", X_train.shape)
    print("Observations in validation:", X_val.shape)
    print("Observations in test:", X_test.shape)

    return X_train, X_test, X_val, y_train, y_test, y_val
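
# Example usage (a sketch with hypothetical column names; note the return
# order is X_train, X_test, X_val, y_train, y_test, y_val):
#
#     X_train, X_test, X_val, y_train, y_test, y_val = preproc(
#         df,
#         features=["rooms", "area"],
#         targets=["price", "tax"],
#     )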