Skip to content

Commit c6de169

Browse files
Author: mausam.singh (committed)
Add a label encoder for non-numeric targets in supervised encoders' fit/transform
Remove the conversion of y to float in cat_boost.py
1 parent 6138f39 commit c6de169

File tree

3 files changed

+20
-5
lines changed

3 files changed

+20
-5
lines changed

category_encoders/cat_boost.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
103103

104104
def _fit(self, X, y, **kwargs):
105105
X = X.copy(deep=True)
106-
y = y.astype(float) #Incase y is bool or categorical.
107106
self._mean = y.mean()
108107
self.mapping = {col: self._fit_column_map(X[col], y) for col in self.cols}
109108

category_encoders/utils.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66
import pandas as pd
77
import numpy as np
88
import sklearn.base
9-
from pandas.api.types import is_object_dtype, is_string_dtype
9+
from pandas.api.types import is_object_dtype, is_string_dtype, is_numeric_dtype
1010
from pandas.core.dtypes.dtypes import CategoricalDtype
1111
from sklearn.base import BaseEstimator, TransformerMixin
1212
from sklearn.exceptions import NotFittedError
1313
from typing import Dict, List, Optional, Union
1414
from scipy.sparse import csr_matrix
15+
from sklearn.preprocessing import LabelEncoder
1516

1617
__author__ = 'willmcginnis'
1718

@@ -294,11 +295,18 @@ def fit(self, X, y=None, **kwargs):
294295
Returns self.
295296
296297
"""
297-
self._check_fit_inputs(X, y)
298298
X, y = convert_inputs(X, y)
299+
self._check_fit_inputs(X, y)
299300
self.feature_names_in_ = X.columns.tolist()
300301
self.n_features_in_ = len(self.feature_names_in_)
301302

303+
if self._get_tags().get('supervised_encoder'):
304+
if not is_numeric_dtype(y):
305+
self.lab_encoder_ = LabelEncoder()
306+
y = self.lab_encoder_.fit_transform(y)
307+
else:
308+
self.lab_encoder_ = None
309+
302310
self._dim = X.shape[1]
303311
self._determine_fit_columns(X)
304312

@@ -324,8 +332,12 @@ def fit(self, X, y=None, **kwargs):
324332
return self
325333

326334
def _check_fit_inputs(self, X, y):
327-
if self._get_tags().get('supervised_encoder') and y is None:
328-
raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None')
335+
if self._get_tags().get('supervised_encoder'):
336+
if y is None:
337+
raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None')
338+
else:
339+
if y.isna().any(): # Target column should never have missing values
340+
raise ValueError("The target column y must not contain missing values.")
329341

330342
def _check_transform_inputs(self, X):
331343
if self.handle_missing == 'error':
@@ -435,6 +447,8 @@ def transform(self, X, y=None, override_return_df=False):
435447
# first check the type
436448
X, y = convert_inputs(X, y, deep=True)
437449
self._check_transform_inputs(X)
450+
if y is not None and self.lab_encoder_ is not None:
451+
y = self.lab_encoder_.transform(y)
438452

439453
if not list(self.cols):
440454
return X

category_encoders/woe.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from category_encoders.ordinal import OrdinalEncoder
44
import category_encoders.utils as util
55
from sklearn.utils.random import check_random_state
6+
import pandas as pd
67

78
__author__ = 'Jan Motl'
89

@@ -87,6 +88,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
8788

8889
def _fit(self, X, y, **kwargs):
8990
# The label must be binary with values {0,1}
91+
y = pd.Series(y)
9092
unique = y.unique()
9193
if len(unique) != 2:
9294
raise ValueError("The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).")

0 commit comments

Comments
 (0)