Skip to content

Commit 21d64ab

Browse files
fixed doctests
1 parent 8f7628d commit 21d64ab

20 files changed, with 476 additions and 524 deletions.

category_encoders/backward_difference.py

Lines changed: 23 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -35,39 +35,33 @@ class BackwardDifferenceEncoder(BaseContrastEncoder):
3535
-------
3636
>>> from category_encoders import *
3737
>>> import pandas as pd
38-
>>> from sklearn.datasets import load_boston
39-
>>> bunch = load_boston()
38+
>>> from sklearn.datasets import fetch_openml
39+
>>> bunch = fetch_openml(name="house_prices", as_frame=True)
40+
>>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
4041
>>> y = bunch.target
41-
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names_out_)
42-
>>> enc = BackwardDifferenceEncoder(cols=['CHAS', 'RAD']).fit(X, y)
42+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
43+
>>> enc = BackwardDifferenceEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
4344
>>> numeric_dataset = enc.transform(X)
4445
>>> print(numeric_dataset.info())
4546
<class 'pandas.core.frame.DataFrame'>
46-
RangeIndex: 506 entries, 0 to 505
47-
Data columns (total 21 columns):
48-
intercept 506 non-null int64
49-
CRIM 506 non-null float64
50-
ZN 506 non-null float64
51-
INDUS 506 non-null float64
52-
CHAS_0 506 non-null float64
53-
NOX 506 non-null float64
54-
RM 506 non-null float64
55-
AGE 506 non-null float64
56-
DIS 506 non-null float64
57-
RAD_0 506 non-null float64
58-
RAD_1 506 non-null float64
59-
RAD_2 506 non-null float64
60-
RAD_3 506 non-null float64
61-
RAD_4 506 non-null float64
62-
RAD_5 506 non-null float64
63-
RAD_6 506 non-null float64
64-
RAD_7 506 non-null float64
65-
TAX 506 non-null float64
66-
PTRATIO 506 non-null float64
67-
B 506 non-null float64
68-
LSTAT 506 non-null float64
69-
dtypes: float64(20), int64(1)
70-
memory usage: 83.1 KB
47+
RangeIndex: 1460 entries, 0 to 1459
48+
Data columns (total 12 columns):
49+
# Column Non-Null Count Dtype
50+
--- ------ -------------- -----
51+
0 intercept 1460 non-null int64
52+
1 Id 1460 non-null float64
53+
2 MSSubClass 1460 non-null float64
54+
3 MSZoning 1460 non-null object
55+
4 LotFrontage 1201 non-null float64
56+
5 YearBuilt 1460 non-null float64
57+
6 Heating_0 1460 non-null float64
58+
7 Heating_1 1460 non-null float64
59+
8 Heating_2 1460 non-null float64
60+
9 Heating_3 1460 non-null float64
61+
10 Heating_4 1460 non-null float64
62+
11 CentralAir_0 1460 non-null float64
63+
dtypes: float64(10), int64(1), object(1)
64+
memory usage: 137.0+ KB
7165
None
7266
7367
References

category_encoders/basen.py

Lines changed: 21 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -62,36 +62,31 @@ class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
6262
-------
6363
>>> from category_encoders import *
6464
>>> import pandas as pd
65-
>>> from sklearn.datasets import load_boston
66-
>>> bunch = load_boston()
65+
>>> from sklearn.datasets import fetch_openml
66+
>>> bunch = fetch_openml(name="house_prices", as_frame=True)
67+
>>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
6768
>>> y = bunch.target
68-
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names_out_)
69-
>>> enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
69+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
70+
>>> enc = BaseNEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
7071
>>> numeric_dataset = enc.transform(X)
7172
>>> print(numeric_dataset.info())
7273
<class 'pandas.core.frame.DataFrame'>
73-
RangeIndex: 506 entries, 0 to 505
74-
Data columns (total 18 columns):
75-
CRIM 506 non-null float64
76-
ZN 506 non-null float64
77-
INDUS 506 non-null float64
78-
CHAS_0 506 non-null int64
79-
CHAS_1 506 non-null int64
80-
NOX 506 non-null float64
81-
RM 506 non-null float64
82-
AGE 506 non-null float64
83-
DIS 506 non-null float64
84-
RAD_0 506 non-null int64
85-
RAD_1 506 non-null int64
86-
RAD_2 506 non-null int64
87-
RAD_3 506 non-null int64
88-
RAD_4 506 non-null int64
89-
TAX 506 non-null float64
90-
PTRATIO 506 non-null float64
91-
B 506 non-null float64
92-
LSTAT 506 non-null float64
93-
dtypes: float64(11), int64(7)
94-
memory usage: 71.3 KB
74+
RangeIndex: 1460 entries, 0 to 1459
75+
Data columns (total 10 columns):
76+
# Column Non-Null Count Dtype
77+
--- ------ -------------- -----
78+
0 Id 1460 non-null float64
79+
1 MSSubClass 1460 non-null float64
80+
2 MSZoning 1460 non-null object
81+
3 LotFrontage 1201 non-null float64
82+
4 YearBuilt 1460 non-null float64
83+
5 Heating_0 1460 non-null int64
84+
6 Heating_1 1460 non-null int64
85+
7 Heating_2 1460 non-null int64
86+
8 CentralAir_0 1460 non-null int64
87+
9 CentralAir_1 1460 non-null int64
88+
dtypes: float64(4), int64(5), object(1)
89+
memory usage: 114.2+ KB
9590
None
9691
9792
"""

category_encoders/binary.py

Lines changed: 21 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -33,36 +33,31 @@ class BinaryEncoder(BaseNEncoder):
3333
-------
3434
>>> from category_encoders import *
3535
>>> import pandas as pd
36-
>>> from sklearn.datasets import load_boston
37-
>>> bunch = load_boston()
36+
>>> from sklearn.datasets import fetch_openml
37+
>>> bunch = fetch_openml(name="house_prices", as_frame=True)
38+
>>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
3839
>>> y = bunch.target
39-
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names_out_)
40-
>>> enc = BinaryEncoder(cols=['CHAS', 'RAD']).fit(X, y)
40+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
41+
>>> enc = BinaryEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
4142
>>> numeric_dataset = enc.transform(X)
4243
>>> print(numeric_dataset.info())
4344
<class 'pandas.core.frame.DataFrame'>
44-
RangeIndex: 506 entries, 0 to 505
45-
Data columns (total 18 columns):
46-
CRIM 506 non-null float64
47-
ZN 506 non-null float64
48-
INDUS 506 non-null float64
49-
CHAS_0 506 non-null int64
50-
CHAS_1 506 non-null int64
51-
NOX 506 non-null float64
52-
RM 506 non-null float64
53-
AGE 506 non-null float64
54-
DIS 506 non-null float64
55-
RAD_0 506 non-null int64
56-
RAD_1 506 non-null int64
57-
RAD_2 506 non-null int64
58-
RAD_3 506 non-null int64
59-
RAD_4 506 non-null int64
60-
TAX 506 non-null float64
61-
PTRATIO 506 non-null float64
62-
B 506 non-null float64
63-
LSTAT 506 non-null float64
64-
dtypes: float64(11), int64(7)
65-
memory usage: 71.3 KB
45+
RangeIndex: 1460 entries, 0 to 1459
46+
Data columns (total 10 columns):
47+
# Column Non-Null Count Dtype
48+
--- ------ -------------- -----
49+
0 Id 1460 non-null float64
50+
1 MSSubClass 1460 non-null float64
51+
2 MSZoning 1460 non-null object
52+
3 LotFrontage 1201 non-null float64
53+
4 YearBuilt 1460 non-null float64
54+
5 Heating_0 1460 non-null int64
55+
6 Heating_1 1460 non-null int64
56+
7 Heating_2 1460 non-null int64
57+
8 CentralAir_0 1460 non-null int64
58+
9 CentralAir_1 1460 non-null int64
59+
dtypes: float64(4), int64(5), object(1)
60+
memory usage: 114.2+ KB
6661
None
6762
6863
"""

category_encoders/cat_boost.py

Lines changed: 18 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -54,31 +54,28 @@ class CatBoostEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
5454
-------
5555
>>> from category_encoders import *
5656
>>> import pandas as pd
57-
>>> from sklearn.datasets import load_boston
58-
>>> bunch = load_boston()
57+
>>> from sklearn.datasets import fetch_openml
58+
>>> bunch = fetch_openml(name="house_prices", as_frame=True)
59+
>>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
5960
>>> y = bunch.target
60-
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names_out_)
61-
>>> enc = CatBoostEncoder(cols=['CHAS', 'RAD']).fit(X, y)
61+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
62+
>>> enc = CatBoostEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
6263
>>> numeric_dataset = enc.transform(X)
6364
>>> print(numeric_dataset.info())
6465
<class 'pandas.core.frame.DataFrame'>
65-
RangeIndex: 506 entries, 0 to 505
66-
Data columns (total 13 columns):
67-
CRIM 506 non-null float64
68-
ZN 506 non-null float64
69-
INDUS 506 non-null float64
70-
CHAS 506 non-null float64
71-
NOX 506 non-null float64
72-
RM 506 non-null float64
73-
AGE 506 non-null float64
74-
DIS 506 non-null float64
75-
RAD 506 non-null float64
76-
TAX 506 non-null float64
77-
PTRATIO 506 non-null float64
78-
B 506 non-null float64
79-
LSTAT 506 non-null float64
80-
dtypes: float64(13)
81-
memory usage: 51.5 KB
66+
RangeIndex: 1460 entries, 0 to 1459
67+
Data columns (total 7 columns):
68+
# Column Non-Null Count Dtype
69+
--- ------ -------------- -----
70+
0 Id 1460 non-null float64
71+
1 MSSubClass 1460 non-null float64
72+
2 MSZoning 1460 non-null object
73+
3 LotFrontage 1201 non-null float64
74+
4 YearBuilt 1460 non-null float64
75+
5 Heating 1460 non-null float64
76+
6 CentralAir 1460 non-null float64
77+
dtypes: float64(6), object(1)
78+
memory usage: 80.0+ KB
8279
None
8380
8481
References

category_encoders/count.py

Lines changed: 18 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -71,34 +71,30 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
7171
Example
7272
-------
7373
>>> import pandas as pd
74-
>>> from sklearn.datasets import load_boston
74+
>>> from sklearn.datasets import fetch_openml
7575
>>> from category_encoders import CountEncoder
7676
77-
>>> bunch = load_boston()
77+
>>> bunch = fetch_openml(name="house_prices", as_frame=True)
78+
>>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
7879
>>> y = bunch.target
79-
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names_out_)
80-
>>> enc = CountEncoder(cols=['CHAS', 'RAD']).fit(X, y)
80+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
81+
>>> enc = CountEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
8182
>>> numeric_dataset = enc.transform(X)
82-
8383
>>> print(numeric_dataset.info())
8484
<class 'pandas.core.frame.DataFrame'>
85-
RangeIndex: 506 entries, 0 to 505
86-
Data columns (total 13 columns):
87-
CRIM 506 non-null float64
88-
ZN 506 non-null float64
89-
INDUS 506 non-null float64
90-
CHAS 506 non-null int64
91-
NOX 506 non-null float64
92-
RM 506 non-null float64
93-
AGE 506 non-null float64
94-
DIS 506 non-null float64
95-
RAD 506 non-null int64
96-
TAX 506 non-null float64
97-
PTRATIO 506 non-null float64
98-
B 506 non-null float64
99-
LSTAT 506 non-null float64
100-
dtypes: float64(11), int64(2)
101-
memory usage: 51.5 KB
85+
RangeIndex: 1460 entries, 0 to 1459
86+
Data columns (total 7 columns):
87+
# Column Non-Null Count Dtype
88+
--- ------ -------------- -----
89+
0 Id 1460 non-null float64
90+
1 MSSubClass 1460 non-null float64
91+
2 MSZoning 1460 non-null object
92+
3 LotFrontage 1201 non-null float64
93+
4 YearBuilt 1460 non-null float64
94+
5 Heating 1460 non-null int64
95+
6 CentralAir 1460 non-null int64
96+
dtypes: float64(4), int64(2), object(1)
97+
memory usage: 80.0+ KB
10298
None
10399
104100
References

category_encoders/glmm.py

Lines changed: 19 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -60,31 +60,28 @@ class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
6060
-------
6161
>>> from category_encoders import *
6262
>>> import pandas as pd
63-
>>> from sklearn.datasets import load_boston
64-
>>> bunch = load_boston()
65-
>>> y = bunch.target > 22.5
66-
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names_out_)
67-
>>> enc = GLMMEncoder(cols=['CHAS', 'RAD']).fit(X, y)
63+
>>> from sklearn.datasets import fetch_openml
64+
>>> bunch = fetch_openml(name="house_prices", as_frame=True)
65+
>>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
66+
>>> y = bunch.target > 200000
67+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
68+
>>> enc = GLMMEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
6869
>>> numeric_dataset = enc.transform(X)
6970
>>> print(numeric_dataset.info())
7071
<class 'pandas.core.frame.DataFrame'>
71-
RangeIndex: 506 entries, 0 to 505
72-
Data columns (total 13 columns):
73-
CRIM 506 non-null float64
74-
ZN 506 non-null float64
75-
INDUS 506 non-null float64
76-
CHAS 506 non-null float64
77-
NOX 506 non-null float64
78-
RM 506 non-null float64
79-
AGE 506 non-null float64
80-
DIS 506 non-null float64
81-
RAD 506 non-null float64
82-
TAX 506 non-null float64
83-
PTRATIO 506 non-null float64
84-
B 506 non-null float64
85-
LSTAT 506 non-null float64
86-
dtypes: float64(13)
87-
memory usage: 51.5 KB
72+
RangeIndex: 1460 entries, 0 to 1459
73+
Data columns (total 7 columns):
74+
# Column Non-Null Count Dtype
75+
--- ------ -------------- -----
76+
0 Id 1460 non-null float64
77+
1 MSSubClass 1460 non-null float64
78+
2 MSZoning 1460 non-null object
79+
3 LotFrontage 1201 non-null float64
80+
4 YearBuilt 1460 non-null float64
81+
5 Heating 1460 non-null float64
82+
6 CentralAir 1460 non-null float64
83+
dtypes: float64(6), object(1)
84+
memory usage: 80.0+ KB
8885
None
8986
9087
References

category_encoders/gray.py

Lines changed: 21 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -40,36 +40,31 @@ class GrayEncoder(BaseNEncoder):
4040
-------
4141
>>> from category_encoders import GrayEncoder
4242
>>> import pandas as pd
43-
>>> from sklearn.datasets import load_boston
44-
>>> bunch = load_boston()
43+
>>> from sklearn.datasets import fetch_openml
44+
>>> bunch = fetch_openml(name="house_prices", as_frame=True)
45+
>>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
4546
>>> y = bunch.target
46-
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names_out_)
47-
>>> enc = GrayEncoder(cols=['CHAS', 'RAD']).fit(X, y)
47+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
48+
>>> enc = GrayEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
4849
>>> numeric_dataset = enc.transform(X)
4950
>>> print(numeric_dataset.info())
5051
<class 'pandas.core.frame.DataFrame'>
51-
RangeIndex: 506 entries, 0 to 505
52-
Data columns (total 18 columns):
53-
CRIM 506 non-null float64
54-
ZN 506 non-null float64
55-
INDUS 506 non-null float64
56-
CHAS_0 506 non-null int64
57-
CHAS_1 506 non-null int64
58-
NOX 506 non-null float64
59-
RM 506 non-null float64
60-
AGE 506 non-null float64
61-
DIS 506 non-null float64
62-
RAD_0 506 non-null int64
63-
RAD_1 506 non-null int64
64-
RAD_2 506 non-null int64
65-
RAD_3 506 non-null int64
66-
RAD_4 506 non-null int64
67-
TAX 506 non-null float64
68-
PTRATIO 506 non-null float64
69-
B 506 non-null float64
70-
LSTAT 506 non-null float64
71-
dtypes: float64(11), int64(7)
72-
memory usage: 71.3 KB
52+
RangeIndex: 1460 entries, 0 to 1459
53+
Data columns (total 10 columns):
54+
# Column Non-Null Count Dtype
55+
--- ------ -------------- -----
56+
0 Id 1460 non-null float64
57+
1 MSSubClass 1460 non-null float64
58+
2 MSZoning 1460 non-null object
59+
3 LotFrontage 1201 non-null float64
60+
4 YearBuilt 1460 non-null float64
61+
5 Heating_0 1460 non-null int64
62+
6 Heating_1 1460 non-null int64
63+
7 Heating_2 1460 non-null int64
64+
8 CentralAir_0 1460 non-null int64
65+
9 CentralAir_1 1460 non-null int64
66+
dtypes: float64(4), int64(5), object(1)
67+
memory usage: 114.2+ KB
7368
None
7469
7570
References

0 commit comments

Comments
 (0)