
A ValueError occurs during hyperparameter tuning in the candidate script using GradientBoosting #64

@mariko-sugawara

Description

Describe the bug
If the hyperparameter 'loss' is set to 'exponential' in GradientBoostingClassifier, gradient boosting recovers the AdaBoost algorithm. The exponential loss only supports binary (two-class) targets, so a ValueError occurs when the target is multiclass.
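
A minimal standalone reproduction of the error (a sketch, not from the original report; it assumes scikit-learn's bundled iris dataset, which has 3 classes like the data used here):

    from sklearn.datasets import load_iris
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = load_iris(return_X_y=True)  # 3 target classes
    model = GradientBoostingClassifier(loss='exponential')
    model.fit(X, y)  # raises ValueError: ExponentialLoss requires 2 classes; got 3 class(es)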

To Reproduce
Steps to reproduce the behavior:

  1. Show your code calling generate_code().
script
    cls = SapientML(
        target_columns=["species"],
        add_explanation=True,
        split_train_size=0.75,
        hyperparameter_tuning=True,
        hyperparameter_tuning_n_trials=10,
        hyperparameter_tuning_timeout=600,
    )
    
    model = cls.fit(train_data_all).model
  2. Attach the datasets or dataframes input to generate_code() if possible.
    https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
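
    The linked page is scikit-learn's bundled iris dataset. A sketch of how the input dataframe could be built from it (the rename to 'species' is an assumption made to match target_columns, not taken from the report):

    import pandas as pd
    from sklearn.datasets import load_iris

    iris = load_iris(as_frame=True)
    train_data_all = iris.frame.rename(columns={'target': 'species'})  # 3 classes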

  3. Show the generated code such as 1_default.py when it was generated.

generated code
# *** GENERATED PIPELINE ***

# LOAD DATA
import pandas as pd
train_dataset = pd.read_pickle(r"/home/sugawara/PoC/mobilePF/outputs/training.pkl")

# TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split
def split_dataset(dataset, train_size=0.75, random_state=17):
    train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state)
    return train_dataset, test_dataset
train_dataset, test_dataset = split_dataset(train_dataset)
train_dataset, validation_dataset = split_dataset(train_dataset)

# SUBSAMPLE
# If the number of rows of train_dataset is larger than sample_size, sample rows to sample_size for speedup.
from lib.sample_dataset import sample_dataset
train_dataset = sample_dataset(
    dataframe=train_dataset,
    sample_size=100000,
    target_columns=['species'],
    task_type='classification'
)

test_dataset = validation_dataset


# DETACH TARGET
TARGET_COLUMNS = ['species']
feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)
target_train = train_dataset[TARGET_COLUMNS].copy()
feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1)
target_test = test_dataset[TARGET_COLUMNS].copy()

# HYPERPARAMETER OPTIMIZATION
import optuna
from sklearn.ensemble import GradientBoostingClassifier
# NEED CV: ex.) optuna.integration.OptunaSearchCV()
class Objective(object):
    def __init__(self, feature_train, target_train, feature_test, target_test, __random_state):
        self.feature_train = feature_train
        self.target_train = target_train
        self.feature_test = feature_test
        self.target_test = target_test 
        self.__random_state = __random_state
    def __call__(self, trial):
        def set_hyperparameters(trial):
            params = {}
            params['loss'] =  trial.suggest_categorical('loss', ['log_loss', 'deviance', 'exponential']) # log_loss 
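            # 'exponential' is only valid for binary targets; suggesting it here is
            # what triggers the ValueError reported below on a 3-class target.
            # ('deviance' is a deprecated alias of 'log_loss' in recent scikit-learn.)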
            params['n_estimators'] =  trial.suggest_int('n_estimators', 10, 1000, log=True) # 100
            params['subsample'] = trial.suggest_float('subsample', 0.2, 1) # 1  
            params['criterion'] = trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error']) # 'friedman_mse'
            params['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1, 32, log=True) # 1
            params['max_features'] = trial.suggest_categorical('max_features', ['sqrt','log2', None]) # None 
            return params
        
        # SET DATA
        import numpy as np
    
        if isinstance(self.feature_train, pd.DataFrame):
            feature_train = self.feature_train
        elif isinstance(self.feature_train, np.ndarray):
            feature_train = pd.DataFrame(self.feature_train)
        else:
            feature_train = pd.DataFrame(self.feature_train.toarray())
    
        if isinstance(self.target_train, pd.DataFrame):
            target_train = self.target_train
        elif isinstance(self.target_train, np.ndarray):
            target_train = pd.DataFrame(self.target_train)
        else:
            target_train = pd.DataFrame(self.target_train.toarray())
    
        if isinstance(self.feature_test, pd.DataFrame):
            feature_test = self.feature_test
        elif isinstance(self.feature_test, np.ndarray):
            feature_test = pd.DataFrame(self.feature_test)
        else:
            feature_test = pd.DataFrame(self.feature_test.toarray())
    
        if isinstance(self.target_test, pd.DataFrame):
            target_test = self.target_test
        elif isinstance(self.target_test, np.ndarray):
            target_test = pd.DataFrame(self.target_test)
        else:
            target_test = pd.DataFrame(self.target_test.toarray())
        # MODEL 
        params = set_hyperparameters(trial)
        model = GradientBoostingClassifier(random_state=self.__random_state, **params)
        model.fit(feature_train, target_train.values.ravel())
        y_pred = model.predict(feature_test)
        
        from sklearn import metrics
        score = metrics.f1_score(target_test, y_pred, average='macro')
        
        return score
    
n_trials = 10
timeout = 600 
random_state = 1023 
random_state_model = 42 
direction = 'maximize' 
    
study = optuna.create_study(direction=direction,
                sampler=optuna.samplers.TPESampler(seed=random_state)) 
default_hyperparameters = {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0}
study.enqueue_trial(default_hyperparameters)
study.optimize(Objective(feature_train, target_train, feature_test, target_test, random_state_model), 
                n_trials=n_trials, 
                timeout=timeout)
best_params = study.best_params
print("best params:", best_params)
print("RESULT: f1: " + str(study.best_value))
  4. Show the messages of SapientML and/or generated code.
ValueError: ExponentialLoss requires 2 classes; got 3 class(es)

Expected behavior

Hyperparameter tuning completes without a ValueError: for a multiclass target, the generated candidate script should not suggest 'exponential' for 'loss'.

Environment (please complete the following information):

  • SapientML Version: 0.4.12.post0

Additional context
If the target is multiclass, the search space for the "loss" parameter must be restricted to "log_loss" only.
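
A possible shape for that restriction, sketched against the generated Objective above (variable names follow the generated code; the conditional itself is an assumption, not the actual SapientML fix):

    # Restrict the 'loss' search space based on the number of target classes.
    n_classes = target_train.iloc[:, 0].nunique()
    if n_classes > 2:
        # Multiclass target: exponential loss is invalid, so offer log_loss only.
        loss_choices = ['log_loss']
    else:
        loss_choices = ['log_loss', 'exponential']
    params['loss'] = trial.suggest_categorical('loss', loss_choices)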
