11"""CatBoost coding"""
2+
23import numpy as np
34import pandas as pd
45from sklearn .base import BaseEstimator , TransformerMixin
@@ -15,19 +16,21 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     values "on-the-fly". Consequently, the values naturally vary
     during the training phase and it is not necessary to add random noise.
 
-    Beware, the training data have to be randomly permuted. E.g.:
+    Beware, the training data have to be randomly permuted. E.g.::
+
         # Random permutation
         perm = np.random.permutation(len(X))
         X = X.iloc[perm].reset_index(drop=True)
         y = y.iloc[perm].reset_index(drop=True)
+
     This is necessary because some datasets are sorted based on the target
     value and this encoder encodes the features on-the-fly in a single pass.
 
     Parameters
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
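
The permutation advice in this hunk is worth a concrete illustration. A minimal sketch of shuffling a target-sorted dataset before fitting (the toy data are hypothetical, and the import path category_encoders.CatBoostEncoder is an assumption about the package layout)::

    import numpy as np
    import pandas as pd
    from category_encoders import CatBoostEncoder  # assumed import path

    # Toy frame sorted by target, the worst case for a single-pass encoder.
    X = pd.DataFrame({'city': ['a', 'a', 'b', 'b', 'c', 'c']})
    y = pd.Series([0, 0, 0, 1, 1, 1])

    # Random permutation before fitting, as the docstring advises.
    perm = np.random.permutation(len(X))
    X = X.iloc[perm].reset_index(drop=True)
    y = y.iloc[perm].reset_index(drop=True)

    X_encoded = CatBoostEncoder(cols=['city']).fit_transform(X, y)

Without the shuffle, every row of a target-sorted frame would see an unrepresentative running target mean, which is exactly what the docstring warns about.
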
@@ -48,7 +51,7 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     >>> bunch = load_boston()
     >>> y = bunch.target
     >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
-    >>> enc = LeaveOneOutEncoder(cols=['CHAS', 'RAD']).fit(X, y)
+    >>> enc = CatBoostEncoder(cols=['CHAS', 'RAD']).fit(X, y)
     >>> numeric_dataset = enc.transform(X)
     >>> print(numeric_dataset.info())
     <class 'pandas.core.frame.DataFrame'>
@@ -74,8 +77,9 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     References
     ----------
 
-    .. [1] Transforming categorical features to numerical features. from
-       https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/.
+    .. [1] Transforming categorical features to numerical features, from
+       https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
+
     """
 
     def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
@@ -84,7 +88,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
         self.drop_invariant = drop_invariant
         self.drop_cols = []
         self.verbose = verbose
-        self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X
+        self.use_default_cols = cols is None  # if True, even a repeated call of fit() will select string columns from X
         self.cols = cols
         self._dim = None
         self.mapping = None
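
The inline comment in this hunk documents a subtlety: with cols=None, string columns are re-detected on every call to fit(), not frozen at construction time. A minimal sketch of that behavior (the frames and column names are hypothetical, import path assumed as above)::

    import pandas as pd
    from category_encoders import CatBoostEncoder  # assumed import path

    enc = CatBoostEncoder()  # cols=None, so use_default_cols is True

    # First fit: 'city' is the only string column, so only 'city' is encoded.
    enc.fit(pd.DataFrame({'city': ['a', 'b'], 'size': [1, 2]}), pd.Series([0, 1]))

    # Refitting on different data re-selects string columns from scratch,
    # so both 'city' and 'tier' are now encoded.
    enc.fit(pd.DataFrame({'city': ['a', 'b'], 'tier': ['x', 'y']}), pd.Series([0, 1]))
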
@@ -280,7 +284,7 @@ def transform_leave_one_out(self, X_in, y, mapping=None):
                 level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
                 X[col] = X[col].map(level_means)
             else:
-                ## Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
+                # Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
                 # The nice thing about this is that it helps to prevent overfitting. The bad thing
                 # is that CatBoost uses many iterations over the data. But we run just one iteration.
                 # Still, it works better than leave-one-out without any noise.
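
The comments in this hunk describe the single-pass scheme compactly: each row is encoded from the cumulative target statistics of the rows seen before it, smoothed toward the prior mean, so no row ever sees its own target. A minimal sketch of the idea (an illustration of the technique, not the library's exact code)::

    import pandas as pd

    def catboost_style_encode(col: pd.Series, y: pd.Series, prior: float) -> pd.Series:
        # Inclusive cumulative sum per category, minus the current target,
        # leaves the target sum over the preceding rows only.
        cumsum = y.groupby(col).cumsum() - y
        # Number of preceding rows sharing this category (0-based running count).
        cumcount = y.groupby(col).cumcount()
        # Smooth toward the prior; a category's first occurrence gets the prior itself.
        return (cumsum + prior) / (cumcount + 1)

Because the statistics are empty at a category's first occurrence, that row falls back to the prior mean, and later duplicates of the same category receive different values; this is the natural variation the docstring mentions in place of added random noise.
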
@@ -308,11 +312,12 @@ def get_feature_names(self):
308312 """
309313 Returns the names of all transformed / added columns.
310314
311- Returns:
312- --------
315+ Returns
316+ -------
313317 feature_names: list
314318 A list with all feature names transformed or added.
315319 Note: potentially dropped features are not included!
320+
316321 """
317322
318323 if not isinstance (self .feature_names , list ):
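
For completeness, a brief hedged usage of the method documented in this hunk, assuming enc is the fitted encoder from the docstring example::

    enc = CatBoostEncoder(cols=['CHAS', 'RAD']).fit(X, y)
    print(enc.get_feature_names())  # output column names; dropped invariant columns excluded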