
Commit 5e9e803

Reformatted the code and updated the examples

1 parent 374875b · commit 5e9e803

36 files changed: +343 −270 lines changed

category_encoders/backward_difference.py
Lines changed: 6 additions & 6 deletions

@@ -17,7 +17,7 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -75,13 +75,12 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
     References
     ----------
 
-    .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
-        https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
+    .. [1] Contrast Coding Systems for Categorical Variables, from
+        https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
 
     .. [2] Gregory Carey (2003). Coding Categorical Variables, from
         http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
 
-
     """
 
     def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
@@ -282,11 +281,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):
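
For context, a minimal usage sketch of the encoder whose docstring these hunks touch; the toy DataFrame and column name are illustrative, not part of the commit:

    import pandas as pd
    import category_encoders as ce

    # Toy data (hypothetical); any string column works
    X = pd.DataFrame({'color': ['red', 'green', 'blue', 'green']})
    enc = ce.BackwardDifferenceEncoder(cols=['color'])
    X_encoded = enc.fit_transform(X)
    print(enc.get_feature_names())  # names of transformed / added columns, per the docstring above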

category_encoders/basen.py
Lines changed: 10 additions & 6 deletions

@@ -20,7 +20,7 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -310,9 +310,11 @@ def basen_encode(self, X_in, cols=None):
         X_in: DataFrame
         cols: list-like, default None
             Column names in the DataFrame to be encoded
+
         Returns
         -------
         dummies : DataFrame
+
         """
 
         X = X_in.copy(deep=True)
@@ -348,6 +350,7 @@ def basen_to_integer(self, X, cols, base):
         Returns
         -------
         numerical: DataFrame
+
         """
         out_cols = X.columns.values.tolist()
 
@@ -360,7 +363,7 @@ def basen_to_integer(self, X, cols, base):
             else:
                 len0 = len(col_list)
                 value_array = np.array([base ** (len0 - 1 - i) for i in range(len0)])
-                X.insert(insert_at,col,np.dot(X[col_list].values, value_array.T))
+                X.insert(insert_at, col, np.dot(X[col_list].values, value_array.T))
                 X.drop(col_list, axis=1, inplace=True)
                 out_cols = X.columns.values.tolist()
 
@@ -374,14 +377,14 @@ def col_transform(self, col, digits):
         if col is None or float(col) < 0.0:
             return None
         else:
-            col = self.numberToBase(int(col), self.base, digits)
+            col = self.number_to_base(int(col), self.base, digits)
             if len(col) == digits:
                 return col
             else:
                 return [0 for _ in range(digits - len(col))] + col
 
     @staticmethod
-    def numberToBase(n, b, limit):
+    def number_to_base(n, b, limit):
         if b == 1:
             return [0 if n != _ else 1 for _ in range(limit)]
 
@@ -399,11 +402,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
        """
 
         if not isinstance(self.feature_names, list):
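
The basen_to_integer hunk only adds whitespace around the insert call, but the line it touches is the core of the decoding step: a dot product between the base-N digit columns and a vector of place values. A standalone sketch of that arithmetic (toy values, not from the library):

    import numpy as np

    base, len0 = 2, 3
    digits = np.array([[1, 0, 1]])  # digit columns for one row: 101 in base 2
    value_array = np.array([base ** (len0 - 1 - i) for i in range(len0)])  # place values [4, 2, 1]
    print(np.dot(digits, value_array.T))  # [5], the recovered integer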

category_encoders/binary.py
Lines changed: 4 additions & 3 deletions

@@ -15,7 +15,7 @@ class BinaryEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -144,11 +144,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         return self.base_n_encoder.get_feature_names()
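
As the final context line shows, BinaryEncoder delegates get_feature_names to its wrapped BaseNEncoder (binary encoding is base-N encoding with base 2). A minimal usage sketch with illustrative toy data:

    import pandas as pd
    import category_encoders as ce

    X = pd.DataFrame({'color': ['red', 'green', 'blue', 'green']})
    enc = ce.BinaryEncoder(cols=['color'])
    X_encoded = enc.fit_transform(X)
    print(enc.get_feature_names())  # delegated to the underlying BaseNEncoder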

category_encoders/cat_boost.py
Lines changed: 14 additions & 9 deletions

@@ -1,4 +1,5 @@
 """CatBoost coding"""
+
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -15,19 +16,21 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     values "on-the-fly". Consequently, the values naturally vary
     during the training phase and it is not necessary to add random noise.
 
-    Beware, the training data have to be randomly permutated. E.g.:
+    Beware, the training data have to be randomly permutated. E.g.::
+
         # Random permutation
         perm = np.random.permutation(len(X))
         X = X.iloc[perm].reset_index(drop=True)
         y = y.iloc[perm].reset_index(drop=True)
+
     This is necessary because some datasets are sorted based on the target
     value and this coder encodes the features on-the-fly in a single pass.
 
     Parameters
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -48,7 +51,7 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     >>> bunch = load_boston()
     >>> y = bunch.target
     >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
-    >>> enc = LeaveOneOutEncoder(cols=['CHAS', 'RAD']).fit(X, y)
+    >>> enc = CatBoostEncoder(cols=['CHAS', 'RAD']).fit(X, y)
     >>> numeric_dataset = enc.transform(X)
     >>> print(numeric_dataset.info())
     <class 'pandas.core.frame.DataFrame'>
@@ -74,8 +77,9 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     References
     ----------
 
-    .. [1] Transforming categorical features to numerical features. from
-        https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/.
+    .. [1] Transforming categorical features to numerical features, from
+        https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
+
     """
 
     def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
@@ -84,7 +88,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
         self.drop_invariant = drop_invariant
         self.drop_cols = []
         self.verbose = verbose
-        self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X
+        self.use_default_cols = cols is None  # if True, even a repeated call of fit() will select string columns from X
         self.cols = cols
         self._dim = None
         self.mapping = None
@@ -280,7 +284,7 @@ def transform_leave_one_out(self, X_in, y, mapping=None):
                 level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
                 X[col] = X[col].map(level_means)
             else:
-                ## Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
+                # Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
                 # The nice thing about this is that it helps to prevent overfitting. The bad thing
                 # is that CatBoost uses many iterations over the data. But we run just one iteration.
                 # Still, it works better than leave-one-out without any noise.
@@ -308,11 +312,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):
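
Putting the two docstring fixes together — the permutation snippet (now rendered as a literal block) and the example that now constructs a CatBoostEncoder instead of a LeaveOneOutEncoder — a minimal end-to-end sketch with illustrative toy data:

    import numpy as np
    import pandas as pd
    import category_encoders as ce

    X = pd.DataFrame({'cat': ['a', 'b', 'a', 'c', 'b', 'a']})
    y = pd.Series([1.0, 0.0, 1.0, 0.0, 1.0, 0.0])

    # Random permutation, as the docstring advises for target-sorted datasets
    perm = np.random.permutation(len(X))
    X = X.iloc[perm].reset_index(drop=True)
    y = y.iloc[perm].reset_index(drop=True)

    enc = ce.CatBoostEncoder(cols=['cat']).fit(X, y)
    numeric_dataset = enc.transform(X)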

category_encoders/hashing.py
Lines changed: 6 additions & 5 deletions

@@ -20,7 +20,7 @@ class HashingEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -69,8 +69,8 @@ class HashingEncoder(BaseEstimator, TransformerMixin):
 
     References
     ----------
-    .. [1] Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; Josh Attenberg (2009). Feature Hashing for
-        Large Scale Multitask Learning. Proc. ICML.
+    .. [1] Feature Hashing for Large Scale Multitask Learning, from
+        https://alex.smola.org/papers/2009/Weinbergeretal09.pdf
 
     """
 
@@ -258,11 +258,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):
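
A minimal usage sketch of the feature-hashing scheme that reference [1] describes (toy data is illustrative; the number of output components is whatever the encoder's constructor defaults to):

    import pandas as pd
    import category_encoders as ce

    X = pd.DataFrame({'color': ['red', 'green', 'blue', 'green']})
    enc = ce.HashingEncoder(cols=['color'])
    X_hashed = enc.fit_transform(X)
    print(enc.get_feature_names())  # names of the hashed component columns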

category_encoders/helmert.py
Lines changed: 7 additions & 6 deletions

@@ -18,7 +18,7 @@ class HelmertEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -76,14 +76,14 @@ class HelmertEncoder(BaseEstimator, TransformerMixin):
     References
     ----------
 
-    .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
-        https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
+    .. [1] Contrast Coding Systems for Categorical Variables, from
+        https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
 
     .. [2] Gregory Carey (2003). Coding Categorical Variables, from
         http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
 
-
     """
+
     def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
                  handle_unknown='indicator', handle_missing='indicator'):
         self.return_df = return_df
@@ -279,11 +279,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):
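
A minimal usage sketch of the Helmert contrast coding described by references [1] and [2]; the toy data is illustrative, and the library's contrast coders also emit an intercept column:

    import pandas as pd
    import category_encoders as ce

    X = pd.DataFrame({'grade': ['low', 'mid', 'high', 'mid']})
    enc = ce.HelmertEncoder(cols=['grade'])
    X_encoded = enc.fit_transform(X)
    print(enc.get_feature_names())  # contrast columns for 'grade', plus the intercept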
