Skip to content

Commit 81bb01d

Browse files
Merge pull request #369 from PaulWestenthanner/refactor/contrast_coding
added base contrast encoder
2 parents beb48b9 + b4f5d49 commit 81bb01d

File tree

6 files changed

+176
-404
lines changed

6 files changed

+176
-404
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
unreleased
2+
==========
3+
* Added base class for contrast coding schemes in order to make them more maintainable
4+
* Added hierarchical column feature in target encoder
5+
* Fixed maximum recursion depth bug in hashing encoder
6+
17
v2.5.0
28
======
39

category_encoders/backward_difference.py

Lines changed: 7 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
"""Backward difference contrast encoding"""
22

3-
import pandas as pd
4-
from patsy.contrasts import Diff
3+
from patsy.contrasts import Diff, ContrastMatrix
54
import numpy as np
6-
from category_encoders.ordinal import OrdinalEncoder
7-
import category_encoders.utils as util
85

9-
__author__ = 'willmcginnis'
6+
from category_encoders.base_contrast_encoder import BaseContrastEncoder
107

8+
__author__ = 'paulwestenthanner'
119

12-
class BackwardDifferenceEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
10+
11+
class BackwardDifferenceEncoder(BaseContrastEncoder):
1312
"""Backward difference contrast coding for encoding categorical variables.
1413
1514
Parameters
@@ -81,98 +80,7 @@ class BackwardDifferenceEncoder(util.BaseEncoder, util.UnsupervisedTransformerMi
8180
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
8281
8382
"""
84-
prefit_ordinal = True
85-
encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE
86-
87-
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
88-
handle_unknown='value', handle_missing='value'):
89-
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
90-
handle_unknown=handle_unknown, handle_missing=handle_missing)
91-
self.mapping = mapping
92-
self.ordinal_encoder = None
93-
94-
def _fit(self, X, y=None, **kwargs):
95-
# train an ordinal pre-encoder
96-
self.ordinal_encoder = OrdinalEncoder(
97-
verbose=self.verbose,
98-
cols=self.cols,
99-
handle_unknown='value',
100-
handle_missing='value'
101-
)
102-
self.ordinal_encoder = self.ordinal_encoder.fit(X)
103-
104-
ordinal_mapping = self.ordinal_encoder.category_mapping
105-
106-
mappings_out = []
107-
for switch in ordinal_mapping:
108-
values = switch.get('mapping')
109-
col = switch.get('col')
110-
111-
column_mapping = self.fit_backward_difference_coding(col, values, self.handle_missing, self.handle_unknown)
112-
mappings_out.append({'col': col, 'mapping': column_mapping, })
113-
114-
self.mapping = mappings_out
115-
116-
def _transform(self, X) -> pd.DataFrame:
117-
X = self.ordinal_encoder.transform(X)
118-
if self.handle_unknown == 'error':
119-
if X[self.cols].isin([-1]).any().any():
120-
raise ValueError('Columns to be encoded can not contain new values')
121-
122-
X = self.backward_difference_coding(X, mapping=self.mapping)
123-
return X
124-
125-
@staticmethod
126-
def fit_backward_difference_coding(col, values, handle_missing, handle_unknown):
127-
if handle_missing == 'value':
128-
values = values[values > 0]
129-
130-
values_to_encode = values.values
131-
132-
if len(values) < 2:
133-
return pd.DataFrame(index=values_to_encode)
134-
135-
if handle_unknown == 'indicator':
136-
values_to_encode = np.append(values_to_encode, -1)
137-
138-
backwards_difference_matrix = Diff().code_without_intercept(values_to_encode)
139-
df = pd.DataFrame(data=backwards_difference_matrix.matrix, index=values_to_encode,
140-
columns=[f"{col}_{i}" for i in range(len(backwards_difference_matrix.column_suffixes))])
141-
142-
if handle_unknown == 'return_nan':
143-
df.loc[-1] = np.nan
144-
elif handle_unknown == 'value':
145-
df.loc[-1] = np.zeros(len(values_to_encode) - 1)
146-
147-
if handle_missing == 'return_nan':
148-
df.loc[values.loc[np.nan]] = np.nan
149-
elif handle_missing == 'value':
150-
df.loc[-2] = np.zeros(len(values_to_encode) - 1)
151-
152-
return df
153-
154-
@staticmethod
155-
def backward_difference_coding(X_in, mapping):
156-
"""
157-
"""
158-
159-
X = X_in.copy(deep=True)
160-
161-
cols = X.columns.values.tolist()
162-
163-
X['intercept'] = pd.Series([1] * X.shape[0], index=X.index)
164-
165-
for switch in mapping:
166-
col = switch.get('col')
167-
mod = switch.get('mapping')
168-
169-
base_df = mod.reindex(X[col])
170-
base_df.set_index(X.index, inplace=True)
171-
X = pd.concat([base_df, X], axis=1)
172-
173-
old_column_index = cols.index(col)
174-
cols[old_column_index: old_column_index + 1] = mod.columns
17583

176-
cols = ['intercept'] + cols
84+
def get_contrast_matrix(self, values_to_encode: np.array) -> ContrastMatrix:
85+
return Diff().code_without_intercept(values_to_encode)
17786

178-
return X.reindex(columns=cols)

category_encoders/base_contrast_encoder.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
"""Base encoder for various contrast coding schemes"""
2+
from abc import abstractmethod
3+
4+
import pandas as pd
5+
from patsy.contrasts import ContrastMatrix
6+
import numpy as np
7+
from category_encoders.ordinal import OrdinalEncoder
8+
import category_encoders.utils as util
9+
10+
__author__ = 'paulwestenthanner'
11+
12+
13+
class BaseContrastEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
14+
"""Base class for various contrast encoders
15+
16+
Parameters
17+
----------
18+
19+
verbose: int
20+
integer indicating verbosity of the output. 0 for none.
21+
cols: list
22+
a list of columns to encode; if None, all string columns will be encoded.
23+
drop_invariant: bool
24+
boolean for whether or not to drop columns with 0 variance.
25+
return_df: bool
26+
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
27+
handle_unknown: str
28+
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
29+
an extra column will be added if the transform matrix has unknown categories. This can cause
30+
unexpected changes in dimension in some cases.
31+
handle_missing: str
32+
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
33+
an extra column will be added if the transform matrix has nan values. This can cause
34+
unexpected changes in dimension in some cases.
35+
36+
References
37+
----------
38+
39+
.. [1] Contrast Coding Systems for Categorical Variables, from
40+
https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
41+
42+
.. [2] Gregory Carey (2003). Coding Categorical Variables, from
43+
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
44+
45+
"""
46+
prefit_ordinal = True
47+
encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE
48+
49+
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
50+
handle_unknown='value', handle_missing='value'):
51+
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
52+
handle_unknown=handle_unknown, handle_missing=handle_missing)
53+
self.mapping = mapping
54+
self.ordinal_encoder = None
55+
56+
def _fit(self, X, y=None, **kwargs):
57+
# train an ordinal pre-encoder
58+
self.ordinal_encoder = OrdinalEncoder(
59+
verbose=self.verbose,
60+
cols=self.cols,
61+
handle_unknown='value',
62+
handle_missing='value'
63+
)
64+
self.ordinal_encoder = self.ordinal_encoder.fit(X)
65+
66+
ordinal_mapping = self.ordinal_encoder.category_mapping
67+
68+
mappings_out = []
69+
for switch in ordinal_mapping:
70+
values = switch.get('mapping')
71+
col = switch.get('col')
72+
73+
column_mapping = self.fit_contrast_coding(col, values, self.handle_missing, self.handle_unknown)
74+
mappings_out.append({'col': col, 'mapping': column_mapping, })
75+
76+
self.mapping = mappings_out
77+
78+
def _transform(self, X) -> pd.DataFrame:
79+
X = self.ordinal_encoder.transform(X)
80+
if self.handle_unknown == 'error':
81+
if X[self.cols].isin([-1]).any().any():
82+
raise ValueError('Columns to be encoded can not contain new values')
83+
84+
X = self.transform_contrast_coding(X, mapping=self.mapping)
85+
return X
86+
87+
@abstractmethod
88+
def get_contrast_matrix(self, values_to_encode: np.array) -> ContrastMatrix:
89+
raise NotImplementedError
90+
91+
def fit_contrast_coding(self, col, values, handle_missing, handle_unknown):
92+
if handle_missing == 'value':
93+
values = values[values > 0]
94+
95+
values_to_encode = values.values
96+
97+
if len(values) < 2:
98+
return pd.DataFrame(index=values_to_encode)
99+
100+
if handle_unknown == 'indicator':
101+
values_to_encode = np.append(values_to_encode, -1)
102+
103+
contrast_matrix = self.get_contrast_matrix(values_to_encode)
104+
df = pd.DataFrame(data=contrast_matrix.matrix, index=values_to_encode,
105+
columns=[f"{col}_{i}" for i in range(len(contrast_matrix.column_suffixes))])
106+
107+
if handle_unknown == 'return_nan':
108+
df.loc[-1] = np.nan
109+
elif handle_unknown == 'value':
110+
df.loc[-1] = np.zeros(len(values_to_encode) - 1)
111+
112+
if handle_missing == 'return_nan':
113+
df.loc[values.loc[np.nan]] = np.nan
114+
elif handle_missing == 'value':
115+
df.loc[-2] = np.zeros(len(values_to_encode) - 1)
116+
117+
return df
118+
119+
@staticmethod
120+
def transform_contrast_coding(X, mapping):
121+
cols = X.columns.values.tolist()
122+
123+
# See issue 370 for the discussion of whether adding an intercept is necessary.
124+
X['intercept'] = pd.Series([1] * X.shape[0], index=X.index)
125+
126+
for switch in mapping:
127+
col = switch.get('col')
128+
mod = switch.get('mapping')
129+
130+
# reindex actually applies the mapping
131+
base_df = mod.reindex(X[col])
132+
base_df.set_index(X.index, inplace=True)
133+
X = pd.concat([base_df, X], axis=1)
134+
135+
old_column_index = cols.index(col)
136+
cols[old_column_index: old_column_index + 1] = mod.columns
137+
138+
# this could lead to problems if an intercept column is already present
139+
# (e.g. if another column has been encoded with another contrast coding scheme)
140+
cols = ['intercept'] + cols
141+
142+
return X.reindex(columns=cols)

category_encoders/helmert.py

Lines changed: 7 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
"""Helmert contrast coding"""
22

33

4-
import pandas as pd
4+
from patsy.contrasts import ContrastMatrix, Helmert
55
import numpy as np
6-
from patsy.contrasts import Helmert
7-
from category_encoders.ordinal import OrdinalEncoder
8-
import category_encoders.utils as util
96

10-
__author__ = 'willmcginnis'
7+
from category_encoders.base_contrast_encoder import BaseContrastEncoder
118

9+
__author__ = 'paulwestenthanner'
1210

13-
class HelmertEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
11+
12+
class HelmertEncoder(BaseContrastEncoder):
1413
"""Helmert contrast coding for encoding categorical features.
1514
1615
Parameters
@@ -82,99 +81,5 @@ class HelmertEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
8281
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
8382
8483
"""
85-
prefit_ordinal = True
86-
encoding_relation = util.EncodingRelation.ONE_TO_ONE
87-
88-
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
89-
handle_unknown='value', handle_missing='value'):
90-
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
91-
handle_unknown=handle_unknown, handle_missing=handle_missing)
92-
self.mapping = mapping
93-
self.ordinal_encoder = None
94-
95-
def _fit(self, X, y=None, **kwargs):
96-
97-
self.ordinal_encoder = OrdinalEncoder(
98-
verbose=self.verbose,
99-
cols=self.cols,
100-
handle_unknown='value',
101-
handle_missing='value'
102-
)
103-
self.ordinal_encoder = self.ordinal_encoder.fit(X)
104-
105-
ordinal_mapping = self.ordinal_encoder.category_mapping
106-
107-
mappings_out = []
108-
for switch in ordinal_mapping:
109-
values = switch.get('mapping')
110-
col = switch.get('col')
111-
112-
column_mapping = self.fit_helmert_coding(col, values, self.handle_missing, self.handle_unknown)
113-
mappings_out.append({'col': col, 'mapping': column_mapping, })
114-
115-
self.mapping = mappings_out
116-
117-
def _transform(self, X):
118-
X = self.ordinal_encoder.transform(X)
119-
120-
if self.handle_unknown == 'error':
121-
if X[self.cols].isin([-1]).any().any():
122-
raise ValueError('Columns to be encoded can not contain new values')
123-
124-
X = self.helmert_coding(X, mapping=self.mapping)
125-
return X
126-
127-
@staticmethod
128-
def fit_helmert_coding(col, values, handle_missing, handle_unknown):
129-
if handle_missing == 'value':
130-
values = values[values > 0]
131-
132-
values_to_encode = values.values
133-
134-
if len(values) < 2:
135-
return pd.DataFrame(index=values_to_encode)
136-
137-
if handle_unknown == 'indicator':
138-
values_to_encode = np.append(values_to_encode, -1)
139-
140-
helmert_contrast_matrix = Helmert().code_without_intercept(values_to_encode)
141-
df = pd.DataFrame(data=helmert_contrast_matrix.matrix, index=values_to_encode,
142-
columns=[f"{col}_{i}" for i in range(len(helmert_contrast_matrix.column_suffixes))])
143-
144-
if handle_unknown == 'return_nan':
145-
df.loc[-1] = np.nan
146-
elif handle_unknown == 'value':
147-
df.loc[-1] = np.zeros(len(values_to_encode) - 1)
148-
149-
if handle_missing == 'return_nan':
150-
df.loc[values.loc[np.nan]] = np.nan
151-
elif handle_missing == 'value':
152-
df.loc[-2] = np.zeros(len(values_to_encode) - 1)
153-
154-
return df
155-
156-
@staticmethod
157-
def helmert_coding(X_in, mapping):
158-
"""
159-
"""
160-
161-
X = X_in.copy(deep=True)
162-
163-
cols = X.columns.values.tolist()
164-
165-
X['intercept'] = pd.Series([1] * X.shape[0], index=X.index)
166-
167-
for switch in mapping:
168-
col = switch.get('col')
169-
mod = switch.get('mapping')
170-
171-
base_df = mod.reindex(X[col])
172-
base_df.set_index(X.index, inplace=True)
173-
X = pd.concat([base_df, X], axis=1)
174-
175-
old_column_index = cols.index(col)
176-
cols[old_column_index: old_column_index + 1] = mod.columns
177-
178-
cols = ['intercept'] + cols
179-
180-
return X.reindex(columns=cols)
84+
def get_contrast_matrix(self, values_to_encode: np.array) -> ContrastMatrix:
85+
return Helmert().code_without_intercept(values_to_encode)

0 commit comments

Comments
 (0)