# *** GENERATED PIPELINE ***
# LOAD DATA
import pandas as pd
train_dataset = pd.read_pickle(r"/outputs/training.pkl")
# TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split
def split_dataset(dataset, train_size=0.75, random_state=17):
    """Split *dataset* into a (train, test) pair with a fixed seed.

    *train_size* is the fraction of rows kept for training; the fixed
    *random_state* makes the split reproducible across runs.
    """
    parts = train_test_split(dataset, train_size=train_size, random_state=random_state)
    return parts[0], parts[1]
# TRAIN / VALIDATION / TEST SPLIT
# First carve a held-out test set off the full data, then carve a
# validation set out of the remaining training rows.
train_dataset, test_dataset = split_dataset(train_dataset)
train_dataset, validation_dataset = split_dataset(train_dataset)

# SUBSAMPLE
# If train_dataset has more rows than sample_size, the helper samples it
# down to sample_size for speed (exact behavior defined in
# lib.sample_dataset).
from lib.sample_dataset import sample_dataset

train_dataset = sample_dataset(
    dataframe=train_dataset,
    sample_size=100000,
    target_columns=['SalePrice'],
    task_type='regression',
)

# Downstream steps score against `test_dataset`; point it at the
# validation split so the original test split stays untouched.
test_dataset = validation_dataset
# PREPROCESSING-1
# Component: Preprocess:SimpleImputer
# Efficient Cause: Preprocess:SimpleImputer is required in this pipeline since the dataset has ['feature:missing_values_presence']. The relevant features are: ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'PoolQC'].
# Purpose: Imputation transformer for completing missing values
# Form:
# Input: array of shape (n_features,)
# Key hyperparameters used:
# "missing_values: int, float, str, np.nan or None, default=np.nan" :: The placeholder for the missing values. All occurrences of missing_values will be imputed. For pandas’ dataframes with nullable integer dtypes with missing values, missing_values should be set to np.nan, since pd.NA will be converted to np.nan.
# "strategy: str, default='mean'" :: The imputation strategy. If "mean", then replace missing values using the mean along each column. Can only be used with numeric data. If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.
# Alternatives: Other imputers (e.g. KNN or iterative imputation) could also handle this dataset; SimpleImputer is used because it is fast and sufficient for these columns.
# Order: Preprocess:SimpleImputer should be applied before encoding and scaling, since those steps cannot handle missing values.
import numpy as np
from sklearn.impute import SimpleImputer

# Numeric columns that contain NaNs in the raw data; replace them with the
# per-column mean learned on the training split only (no test leakage).
NUMERIC_COLS_WITH_MISSING_VALUES = ['GarageYrBlt', 'LotFrontage', 'MasVnrArea']
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_dataset[NUMERIC_COLS_WITH_MISSING_VALUES] = mean_imputer.fit_transform(
    train_dataset[NUMERIC_COLS_WITH_MISSING_VALUES]
)
test_dataset[NUMERIC_COLS_WITH_MISSING_VALUES] = mean_imputer.transform(
    test_dataset[NUMERIC_COLS_WITH_MISSING_VALUES]
)
# PREPROCESSING-2
import numpy as np
from sklearn.impute import SimpleImputer

# String/categorical columns with a moderate number of missing entries:
# fill with the most frequent value, fitted on the training split only.
STRING_COLS_WITH_MISSING_VALUES = ['BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'MasVnrType']
simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train_dataset[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform(train_dataset[STRING_COLS_WITH_MISSING_VALUES])
test_dataset[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.transform(test_dataset[STRING_COLS_WITH_MISSING_VALUES])

# Columns that are missing for almost every row: treat "missing" as its own
# category (empty string) rather than imputing a most-frequent value.
# BUG FIX: fillna('') must run BEFORE astype(str) — astype(str) converts
# NaN into the literal string 'nan', after which fillna('') has nothing
# left to fill, so the original order silently produced 'nan' categories.
STRING_ALMOST_MISSING_COLS = ['Alley', 'Fence', 'MiscFeature', 'PoolQC']
train_dataset[STRING_ALMOST_MISSING_COLS] = train_dataset[STRING_ALMOST_MISSING_COLS].fillna('').astype(str)
test_dataset[STRING_ALMOST_MISSING_COLS] = test_dataset[STRING_ALMOST_MISSING_COLS].fillna('').astype(str)
# PREPROCESSING-3
# Component: Preprocess:OrdinalEncoder
# Efficient Cause: Preprocess:OrdinalEncoder is required in this pipeline since the dataset has ['feature:str_category_presence', 'feature:str_category_small_presence', 'feature:str_category_binary_presence']. The relevant features are: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'].
# Purpose: Encode categorical features as an integer array
# Form:
# Input: list of arrays
# Key hyperparameters used:
# "handle_unknown: {'error', 'use_encoded_value'}, default='error'" :: When set to ‘error’ an error will be raised in case an unknown categorical feature is present during transform. When set to ‘use_encoded_value’, the encoded value of unknown categories will be set to the value given for the parameter unknown_value. In inverse_transform, an unknown category will be denoted as None.
# "unknown_value: int or np.nan, default=None" :: When the parameter handle_unknown is set to ‘use_encoded_value’, this parameter is required and will set the encoded value of unknown categories. It has to be distinct from the values used to encode any of the categories in fit. If set to np.nan, the dtype parameter must be a float dtype.
# Alternatives: Although [Preprocess:OneHotEncoder] can also be used for this dataset, Preprocess:OrdinalEncoder is used because it has more feature:str_category_small_presence than feature:str_category_binary_presence.
# Order: Preprocess:OrdinalEncoder should be applied after missing-value imputation and before the model consumes the features.
from sklearn.preprocessing import OrdinalEncoder

# Every string-typed column, mapped to integer codes.  Categories that
# appear only in the test split are encoded as -1 at transform time
# instead of raising an error.
CATEGORICAL_COLS = ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street', 'Utilities']
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
train_dataset[CATEGORICAL_COLS] = encoder.fit_transform(train_dataset[CATEGORICAL_COLS])
test_dataset[CATEGORICAL_COLS] = encoder.transform(test_dataset[CATEGORICAL_COLS])
# PREPROCESSING-4
# Component: Preprocess:Log
# Efficient Cause: Preprocess:Log is required in this pipeline since the dataset has ['feature:target_imbalance_score', 'feature:normalized_variation_across_columns', 'feature:max_normalized_stddev']. The relevant features are: ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageYrBlt', 'MoSold', 'YrSold', 'SalePrice'].
# Purpose: Return the natural logarithm of one plus the input array, element-wise.
# Form:
# Input: array_like
# Key hyperparameters used: None
# Alternatives: Although [Preprocess:StandardScaler] can also be used for this dataset, Preprocess:Log is used because it has more feature:target_imbalance_score than feature:max_skewness.
# Order: Preprocess:Log should be applied after missing-value imputation, as the last feature transform before detaching the target.
import numpy as np

# Apply log1p to the listed numeric columns (including the SalePrice
# target) to compress their range; predictions are inverted with expm1
# inside the Optuna objective before scoring.
NUMERIC_COLS_TO_SCALE = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageYrBlt', 'MoSold', 'YrSold', 'SalePrice']
train_dataset[NUMERIC_COLS_TO_SCALE] = np.log1p(train_dataset[NUMERIC_COLS_TO_SCALE])
# The test frame may lack some of these columns; transform the overlap only.
NUMERIC_COLS_TO_SCALE_FOR_TEST = [col for col in NUMERIC_COLS_TO_SCALE if col in test_dataset.columns]
test_dataset[NUMERIC_COLS_TO_SCALE_FOR_TEST] = np.log1p(test_dataset[NUMERIC_COLS_TO_SCALE_FOR_TEST])
# DETACH TARGET
# Separate the feature matrix from the regression target on both splits;
# targets are copied so later in-place edits cannot touch the frames.
TARGET_COLUMNS = ['SalePrice']
feature_train = train_dataset.drop(columns=TARGET_COLUMNS)
target_train = train_dataset[TARGET_COLUMNS].copy()
feature_test = test_dataset.drop(columns=TARGET_COLUMNS)
target_test = test_dataset[TARGET_COLUMNS].copy()
# HYPERPARAMETER OPTIMIZATION
import optuna
from sklearn.neural_network import MLPRegressor
# NOTE: no cross-validation is performed here — a single train/validation split is scored; consider optuna.integration.OptunaSearchCV for CV-based tuning.
class Objective(object):
    """Optuna objective for tuning an MLPRegressor on the prepared splits.

    Each trial: sample hyperparameters, fit on the training split, predict
    the evaluation split, undo the log1p target transform, and return the
    r2 score (the study maximizes this value).
    """

    def __init__(self, feature_train, target_train, feature_test, target_test, random_state):
        # NOTE: the original parameter was `__random_state`, which Python
        # name-mangled to `_Objective__random_state`; the single actual
        # caller passes it positionally, so this rename is compatible.
        self.feature_train = feature_train
        self.target_train = target_train
        self.feature_test = feature_test
        self.target_test = target_test
        # Seed forwarded to MLPRegressor so each trial is reproducible.
        self._random_state = random_state

    @staticmethod
    def _set_hyperparameters(trial):
        # Search space; the trailing comment on each line is the sklearn
        # default used for the warm-start trial.
        params = {}
        params['activation'] = trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu'])  # relu
        params['solver'] = trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam'])  # adam
        # suggest_float(..., log=True) is the non-deprecated, equivalent
        # spelling of the old suggest_loguniform.
        params['alpha'] = trial.suggest_float('alpha', 1e-6, 1.0, log=True)  # 0.0001
        return params

    @staticmethod
    def _as_dataframe(data, copy=False):
        """Normalize DataFrame / ndarray / scipy-sparse input to a DataFrame.

        With copy=True the result never shares memory with *data*, so
        in-place edits cannot leak back into objects reused across trials.
        """
        import numpy as np
        if isinstance(data, pd.DataFrame):
            frame = data
        elif isinstance(data, np.ndarray):
            frame = pd.DataFrame(data)
        else:
            frame = pd.DataFrame(data.toarray())
        return frame.copy() if copy else frame

    def __call__(self, trial):
        import numpy as np
        from sklearn import metrics
        # SET DATA (features are only read, so no copy is needed)
        feature_train = self._as_dataframe(self.feature_train)
        target_train = self._as_dataframe(self.target_train)
        feature_test = self._as_dataframe(self.feature_test)
        # BUG FIX: the original copied target_test only in the DataFrame
        # branch; the ndarray/sparse branches shared memory with the
        # caller, so the in-place expm1 below would compound across
        # trials.  Copy unconditionally.
        target_test = self._as_dataframe(self.target_test, copy=True)
        # MODEL
        params = self._set_hyperparameters(trial)
        model = MLPRegressor(random_state=self._random_state, **params)
        model.fit(feature_train, target_train.values.ravel())
        y_pred = model.predict(feature_test)
        # INVERSE TARGET: undo the log1p applied in PREPROCESSING-4 so the
        # score is computed on the original target scale.
        cols_to_be_inversed = list(set(NUMERIC_COLS_TO_SCALE) & set(TARGET_COLUMNS))
        target_test[cols_to_be_inversed] = np.expm1(target_test[cols_to_be_inversed])
        y_pred = pd.DataFrame(data=y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
        y_pred[cols_to_be_inversed] = np.expm1(y_pred[cols_to_be_inversed])
        score = metrics.r2_score(target_test, y_pred.to_numpy())
        return score
# HPO driver: a single seeded TPE study, warm-started from sklearn's
# default MLPRegressor hyperparameters so the tuned result can never be
# worse than the out-of-the-box model on the first trial.
N_TRIALS = 10
TIMEOUT_SECONDS = 600
SAMPLER_SEED = 1023
MODEL_SEED = 42

study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.enqueue_trial({'activation': 'relu', 'alpha': 0.0001, 'solver': 'adam'})
study.optimize(
    Objective(feature_train, target_train, feature_test, target_test, MODEL_SEED),
    n_trials=N_TRIALS,
    timeout=TIMEOUT_SECONDS,
)
best_params = study.best_params
print("best params:", best_params)
print("RESULT: r2: " + str(study.best_value))