Description
Describe the bug
If the hyperparameter 'loss' is 'exponential' in the GradientBoostingClassifier, the AdaBoost algorithm is applied. AdaBoost builds weak learners that separate only two classes, so when the target is multiclass a ValueError occurs.
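For reference, the failure can be reproduced with scikit-learn alone, independent of SapientML. The following is a minimal sketch (assuming a scikit-learn version contemporary with SapientML 0.4.12.post0) that triggers the same ValueError on the 3-class iris target:

from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_iris(return_X_y=True)  # iris has 3 classes
model = GradientBoostingClassifier(loss="exponential")
model.fit(X, y)  # raises ValueError: ExponentialLoss requires 2 classes; got 3 class(es)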
To Reproduce
Steps to reproduce the behavior:
- Show your code calling generate_code().
script
cls = SapientML(
    target_columns=["species"],
    add_explanation=True,
    split_train_size=0.75,
    hyperparameter_tuning=True,
    hyperparameter_tuning_n_trials=10,
    hyperparameter_tuning_timeout=600,
)
model = cls.fit(train_data_all).model
- Attach the datasets or dataframes input to generate_code() if possible.
https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
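For reference, train_data_all in the script above might be built from the linked iris example as sketched below; the renaming of the label column to 'species' is an assumption made to match target_columns=["species"]:

from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
train_data_all = iris.frame.rename(columns={"target": "species"})  # assumed to match target_columns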
- Show the generated code such as 1_default.py when it was generated.
generated code
# *** GENERATED PIPELINE ***
# LOAD DATA
import pandas as pd
train_dataset = pd.read_pickle(r"/home/sugawara/PoC/mobilePF/outputs/training.pkl")
# TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split
def split_dataset(dataset, train_size=0.75, random_state=17):
    train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state)
    return train_dataset, test_dataset
train_dataset, test_dataset = split_dataset(train_dataset)
train_dataset, validation_dataset = split_dataset(train_dataset)
# SUBSAMPLE
# If the number of rows of train_dataset is larger than sample_size, sample rows to sample_size for speedup.
from lib.sample_dataset import sample_dataset
train_dataset = sample_dataset(
    dataframe=train_dataset,
    sample_size=100000,
    target_columns=['species'],
    task_type='classification'
)
test_dataset = validation_dataset
# DETACH TARGET
TARGET_COLUMNS = ['species']
feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)
target_train = train_dataset[TARGET_COLUMNS].copy()
feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1)
target_test = test_dataset[TARGET_COLUMNS].copy()
# HYPERPARAMETER OPTIMIZATION
import optuna
from sklearn.ensemble import GradientBoostingClassifier
# NEED CV: ex.) optuna.integration.OptunaSearchCV()
class Objective(object):
    def __init__(self, feature_train, target_train, feature_test, target_test, __random_state):
        self.feature_train = feature_train
        self.target_train = target_train
        self.feature_test = feature_test
        self.target_test = target_test
        self.__random_state = __random_state

    def __call__(self, trial):
        def set_hyperparameters(trial):
            params = {}
            params['loss'] = trial.suggest_categorical('loss', ['log_loss', 'deviance', 'exponential'])  # log_loss
            params['n_estimators'] = trial.suggest_int('n_estimators', 10, 1000, log=True)  # 100
            params['subsample'] = trial.suggest_float('subsample', 0.2, 1)  # 1
            params['criterion'] = trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error'])  # 'friedman_mse'
            params['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1, 32, log=True)  # 1
            params['max_features'] = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # None
            return params
        # SET DATA
        import numpy as np
        if isinstance(self.feature_train, pd.DataFrame):
            feature_train = self.feature_train
        elif isinstance(self.feature_train, np.ndarray):
            feature_train = pd.DataFrame(self.feature_train)
        else:
            feature_train = pd.DataFrame(self.feature_train.toarray())
        if isinstance(self.target_train, pd.DataFrame):
            target_train = self.target_train
        elif isinstance(self.target_train, np.ndarray):
            target_train = pd.DataFrame(self.target_train)
        else:
            target_train = pd.DataFrame(self.target_train.toarray())
        if isinstance(self.feature_test, pd.DataFrame):
            feature_test = self.feature_test
        elif isinstance(self.feature_test, np.ndarray):
            feature_test = pd.DataFrame(self.feature_test)
        else:
            feature_test = pd.DataFrame(self.feature_test.toarray())
        if isinstance(self.target_test, pd.DataFrame):
            target_test = self.target_test
        elif isinstance(self.target_test, np.ndarray):
            target_test = pd.DataFrame(self.target_test)
        else:
            target_test = pd.DataFrame(self.target_test.toarray())
        # MODEL
        params = set_hyperparameters(trial)
        model = GradientBoostingClassifier(random_state=self.__random_state, **params)
        model.fit(feature_train, target_train.values.ravel())
        y_pred = model.predict(feature_test)
        from sklearn import metrics
        score = metrics.f1_score(target_test, y_pred, average='macro')
        return score
n_trials = 10
timeout = 600
random_state = 1023
random_state_model = 42
direction = 'maximize'
study = optuna.create_study(direction=direction,
                            sampler=optuna.samplers.TPESampler(seed=random_state))
default_hyperparameters = {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0}
study.enqueue_trial(default_hyperparameters)
study.optimize(Objective(feature_train, target_train, feature_test, target_test, random_state_model),
               n_trials=n_trials,
               timeout=timeout)
best_params = study.best_params
print("best params:", best_params)
print("RESULT: f1: " + str(study.best_value))- Show the messages of SapientML and/or generated code.
ValueError: ExponentialLoss requires 2 classes; got 3 class(es)
Expected behavior
Environment (please complete the following information):
- SapientML Version: 0.4.12.post0
Additional context
If the target is multiclass, the parameter "loss" must be set to "log_loss" only.
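One possible direction for a fix is to restrict the search space by the number of target classes. The sketch below is illustrative only; the n_classes argument is hypothetical and not part of the current generated code:

def set_hyperparameters(trial, n_classes):
    params = {}
    if n_classes == 2:
        # 'exponential' (AdaBoost) is only valid for binary targets
        params['loss'] = trial.suggest_categorical('loss', ['log_loss', 'exponential'])
    else:
        params['loss'] = 'log_loss'  # the only valid loss for multiclass targets
    params['n_estimators'] = trial.suggest_int('n_estimators', 10, 1000, log=True)
    return params

# n_classes could be derived before building the Objective, e.g.:
# n_classes = target_train[TARGET_COLUMNS[0]].nunique()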