Skip to content

Commit c189587

Browse files
Merge pull request #222 from Rishoban/rankhot_dev
Rankhot dev
2 parents a745057 + d28b409 commit c189587

File tree

7 files changed

+567
-106
lines changed

7 files changed

+567
-106
lines changed

category_encoders/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from category_encoders.m_estimate import MEstimateEncoder
2424
from category_encoders.james_stein import JamesSteinEncoder
2525
from category_encoders.cat_boost import CatBoostEncoder
26+
from category_encoders.rankhot import RankHotEncoder
2627
from category_encoders.glmm import GLMMEncoder
2728
from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder
2829

@@ -51,4 +52,5 @@
5152
"GLMMEncoder",
5253
"QuantileEncoder",
5354
"SummaryEncoder",
55+
'RankHotEncoder',
5456
]

category_encoders/ordinal.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pandas as pd
55
import category_encoders.utils as util
66
import warnings
7+
from typing import Dict, List, Union
78

89
__author__ = 'willmcginnis'
910

@@ -30,7 +31,7 @@ class OrdinalEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
3031
a mapping of class to label to use for the encoding, optional.
3132
the dict contains the keys 'col' and 'mapping'.
3233
the value of 'col' should be the feature name.
33-
the value of 'mapping' should be a dictionary of 'original_label' to 'encoded_label'.
34+
the value of 'mapping' should be a dictionary or pd.Series of 'original_label' to 'encoded_label'.
3435
example mapping: [
3536
{'col': 'col1', 'mapping': {None: 0, 'a': 1, 'b': 2}},
3637
{'col': 'col2', 'mapping': {None: 0, 'x': 1, 'y': 2}}
@@ -87,6 +88,8 @@ def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, ret
8788
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
8889
handle_unknown=handle_unknown, handle_missing=handle_missing)
8990
self.mapping_supplied = mapping is not None
91+
if self.mapping_supplied:
92+
mapping = self._validate_supplied_mapping(mapping)
9093
self.mapping = mapping
9194

9295
@property
@@ -237,3 +240,28 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand
237240
mapping_out.append({'col': col, 'mapping': data, 'data_type': X[col].dtype}, )
238241

239242
return X, mapping_out
243+
244+
def _validate_supplied_mapping(self, supplied_mapping: List[Dict[str, Union[str, Dict, pd.Series]]]) -> List[Dict[str, Union[str, pd.Series]]]:
    """
    Validate the supplied mapping and convert the actual mapping per column to a pandas series.

    The dicts inside ``supplied_mapping`` are updated in place: a dict under the key 'mapping' is
    replaced by a pd.Series, and a missing 'data_type' key is filled with the dtype of the index
    of that series.

    :param supplied_mapping: mapping as list of dicts, one per column to encode. Each dict needs the keys
        'col' and 'mapping'. The actual mapping can be either a dict or a pd.Series.
    :return: the same list, with all actual mappings being pandas series
    :raises ValueError: if the supplied mapping is not a list of dicts, or an actual mapping is
        neither a dict nor a pd.Series
    :raises KeyError: if an element lacks the key 'col' or the key 'mapping'
    """
    msg = "Invalid supplied mapping, must be of type List[Dict[str, Union[Dict, pd.Series]]]. " \
          "For an example refer to the documentation"
    if not isinstance(supplied_mapping, list):
        raise ValueError(msg)
    for mapping_el in supplied_mapping:
        if not isinstance(mapping_el, dict):
            raise ValueError(msg)
        if "col" not in mapping_el:
            raise KeyError("Mapping must contain a key 'col' for each column to encode")
        if "mapping" not in mapping_el:
            raise KeyError("Mapping must contain a key 'mapping' for each column to encode")
        mapping = mapping_el["mapping"]
        # Check the actual mapping, not its enclosing element (which is always a dict here).
        if isinstance(mapping, dict):
            # convert to pd.Series in order to standardise
            mapping_el["mapping"] = pd.Series(mapping)
        elif not isinstance(mapping, pd.Series):
            raise ValueError(msg)
        if "data_type" not in mapping_el:
            mapping_el["data_type"] = mapping_el["mapping"].index.dtype
    return supplied_mapping

category_encoders/rankhot.py

Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
import numpy as np
2+
import pandas as pd
3+
from category_encoders import OrdinalEncoder
4+
import category_encoders.utils as util
5+
6+
7+
class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
    """The rank-hot encoder is similar to a one-hot encoder,
    except every feature up to and including the current rank is hot.
    This is also called thermometer encoding.

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of the output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    use_cat_names: bool
        if True, category values will be included in the encoded column names.
        Since this can result in duplicate column names,
        duplicates are suffixed with '#' symbol until a unique name is generated.
        If False, category indices will be used instead of the category values.
    handle_unknown: str
        options are 'error', 'value', 'return_nan'.
        The default is 'value'.
        'value': If an unknown label occurs, it is represented as 0 array.
        'error': If an unknown label occurs, a ValueError is raised.
        'return_nan': If an unknown label occurs, np.nan is returned in all columns.
    handle_missing: str
        options are 'error', 'value' and 'return_nan'. The default is 'value'.
        Missing value also considered as unknown value in the final data set.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name="house_prices", as_frame=True)
    >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> enc = RankHotEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 13 columns):
     #   Column        Non-Null Count  Dtype
    ---  ------        --------------  -----
     0   Id            1460 non-null   float64
     1   MSSubClass    1460 non-null   float64
     2   MSZoning      1460 non-null   object
     3   LotFrontage   1201 non-null   float64
     4   YearBuilt     1460 non-null   float64
     5   Heating_1     1460 non-null   int64
     6   Heating_2     1460 non-null   int64
     7   Heating_3     1460 non-null   int64
     8   Heating_4     1460 non-null   int64
     9   Heating_5     1460 non-null   int64
     10  Heating_6     1460 non-null   int64
     11  CentralAir_1  1460 non-null   int64
     12  CentralAir_2  1460 non-null   int64
    dtypes: float64(4), int64(8), object(1)
    memory usage: 148.4+ KB
    None
    """

    prefit_ordinal = True
    encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE

    def __init__(
        self,
        verbose=0,
        cols=None,
        drop_invariant=False,
        return_df=True,
        handle_missing="value",
        handle_unknown="value",
        use_cat_names=None,
    ):
        super().__init__(
            verbose=verbose,
            cols=cols,
            drop_invariant=drop_invariant,
            return_df=return_df,
            handle_unknown=handle_unknown,
            handle_missing=handle_missing,
        )
        self._dim = None
        self.mapping = None
        self.use_cat_names = use_cat_names

    def _fit(self, X, y, **kwargs):
        """Fit the internal ordinal encoder on sorted categories and build the rank-hot mapping."""
        # Translate this encoder's missing-value strategy into one for the ordinal encoder.
        oe_missing_strat = {
            'error': 'error',
            'return_nan': 'return_nan',
            'value': 'value',
            'indicator': 'return_nan',
        }[self.handle_missing]
        # supply custom mapping in order to assure order of ordinal variable
        ordered_mapping = []
        for col in self.cols:
            oe_col = OrdinalEncoder(verbose=self.verbose, cols=[col], handle_unknown="value", handle_missing=oe_missing_strat)
            # Fitting on the sorted column makes the ordinal codes follow the sort order of the values.
            oe_col.fit(X[col].sort_values().to_frame(name=col))
            ordered_mapping += oe_col.mapping

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose, cols=self.cols, handle_unknown="value", handle_missing=oe_missing_strat, mapping=ordered_mapping
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        self.mapping = self.generate_mapping()

        return self

    def _transform(self, X_in, override_return_df=False):
        """Ordinal-encode the fitted columns, then replace each by its rank-hot columns.

        Raises
        ------
        ValueError
            if handle_unknown is 'error' and an unknown value is present, if handle_unknown is not a
            supported option, or if a NaN code is met while handle_missing is neither 'value' nor
            'return_nan'.
        """
        X = X_in.copy(deep=True)
        X = self.ordinal_encoder.transform(X)
        input_cols = X.columns.values.tolist()

        # The ordinal encoder is always built with handle_unknown="value" and marks unknowns with -1.
        if self.handle_unknown == "error":
            if X[self.cols].isin([-1]).any().any():
                raise ValueError("Columns to be encoded can not contain new values")

        for switch in self.mapping:
            col = switch.get("col")
            mod = switch.get("mapping")
            encode_feature_series = X[col]

            # ordinal code -> list of 0/1 flags, one flag per output column
            encoding_dict = {i: list(row.values()) for i, row in mod.to_dict(orient="index").items()}
            if self.handle_unknown in ("value", "error"):
                # With 'error', unknown values were already rejected above, so the default is only a fallback.
                default_value = [0] * len(encoding_dict)
            elif self.handle_unknown == "return_nan":
                default_value = [np.nan] * len(encoding_dict)
            else:
                raise ValueError(f"invalid option for 'handle_unknown' parameter: {self.handle_unknown}")

            def apply_coding(row: pd.Series):
                val = row.iloc[0]
                if pd.isna(val):
                    if self.handle_missing == "value":
                        return default_value
                    elif self.handle_missing == "return_nan":
                        return [np.nan] * len(default_value)
                    else:
                        raise ValueError("Unhandled NaN")
                return encoding_dict.get(val, default_value)

            encoded = encode_feature_series.to_frame().apply(apply_coding, axis=1, result_type="expand")
            encoded.columns = mod.columns

            X = pd.concat([encoded, X], axis=1)

            # Put the new columns where the original column was and drop the original.
            old_column_index = input_cols.index(col)
            input_cols[old_column_index:old_column_index + 1] = mod.columns
            X = X.reindex(columns=input_cols)

        return X

    def create_dataframe(self, X, encoded, key_col):
        """Prepend ``encoded`` to ``X`` column-wise, wrapping raw data in a DataFrame first.

        ``key_col`` is only used to name the columns when ``encoded`` is neither a DataFrame nor a Series;
        it may be a single column name or a list of names.
        """
        if not isinstance(encoded, (pd.DataFrame, pd.Series)):
            # A bare string is not a valid ``columns`` argument, so wrap a single name in a list.
            columns = [key_col] if isinstance(key_col, str) else key_col
            encoded = pd.DataFrame(encoded, columns=columns)

        X_ = pd.concat([encoded, X], axis=1)
        return X_

    def inverse_transform(self, X_in):
        """Rebuild the original categorical columns from their rank-hot columns.

        Rows whose rank-hot columns sum to 0 (unknown or missing values) are returned as np.nan.

        Raises
        ------
        ValueError
            if the encoder has not been fitted, or if the column order of the internal ordinal encoder
            and of this encoder's mapping disagree.
        """
        X = X_in.copy(deep=True)
        cols = X.columns.values.tolist()
        if self._dim is None:
            raise ValueError("Must train encoder before it can be used to inverse_transform data")

        for switch, ordinal_mapping in zip(self.mapping, self.ordinal_encoder.category_mapping):
            col = switch.get("col")
            cats = switch.get("mapping")
            if col != ordinal_mapping.get("col"):
                raise ValueError("Column order of OrdinalEncoder and RankHotEncoder do not match")
            inv_map = {v: k for k, v in ordinal_mapping.get("mapping").to_dict().items()}

            # The number of hot columns in a row is its ordinal code.
            arrs = X[cats.columns]
            reencode = arrs.sum(axis=1).rename(col)
            # Record the unknown rows before decoding, so that a real category equal to 0
            # (or False) is not mistaken for "unknown" afterwards.
            unknown_mask = reencode == 0

            orig_dtype = ordinal_mapping.get("data_type")
            reencode2 = reencode.replace(inv_map).astype(orig_dtype)
            if unknown_mask.any():
                reencode2[unknown_mask] = np.nan

            X = self.create_dataframe(X, reencode2, col)

            first_index = cols.index(cats.columns[0])
            last_index = cols.index(cats.columns[-1]) + 1

            # Swap the run of rank-hot columns for the single restored column.
            del cols[first_index:last_index]
            cols.insert(self.ordinal_encoder.feature_names_out_.index(col), col)

        X = X.reindex(columns=cols)

        return X

    def generate_mapping(self):
        """Build, per encoded column, a lower-triangular 0/1 DataFrame indexed by ordinal code.

        Returns
        -------
        list of dict
            one ``{"col": name, "mapping": DataFrame}`` entry per column of the ordinal encoder that has
            at least one category left to encode.
        """
        mapping = []
        found_column_counts = {}

        for switch in self.ordinal_encoder.mapping:
            col: str = switch.get("col")
            values: pd.Series = switch.get("mapping").copy(deep=True)

            if self.handle_missing == "value":
                # Keep only positive codes; non-positive codes get no output column.
                values = values[values > 0]

            if len(values) == 0:
                continue

            index = []
            new_columns = []

            # Series.items() works on every pandas version; iteritems() was removed in pandas 2.0.
            for cat_name, class_ in values.items():
                if self.use_cat_names:
                    n_col_name = f"{col}_{cat_name}"
                    found_count = found_column_counts.get(n_col_name, 0)
                    found_column_counts[n_col_name] = found_count + 1
                    n_col_name += "#" * found_count
                else:
                    n_col_name = f"{col}_{class_}"

                index.append(class_)
                new_columns.append(n_col_name)

            # Row k has ones in the first k+1 columns: every rank up to and including the current one is hot.
            base_matrix = np.tril(np.ones((len(index), len(index)), dtype=int))
            base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index)

            mapping.append({"col": col, "mapping": base_df})
        return mapping

category_encoders/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
273273

274274
def fit(self, X, y=None, **kwargs):
275275
"""Fits the encoder according to X and y.
276-
276+
277277
Parameters
278278
----------
279279

tests/test_encoders.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -627,14 +627,15 @@ def test_metamorphic(self):
627627
result5 = enc5.fit_transform(x5, y)
628628
self.assertTrue((result1.values == result5.values).all())
629629

630-
# gray encoder re-orders inputs so that nan is last, hence the output is changed
631-
if encoder_name != "GrayEncoder":
630+
# gray encoder and rankhot encoder re-order inputs so that nan is last, hence the output is changed
631+
if encoder_name not in ["GrayEncoder", "RankHotEncoder"]:
632632
enc6 = getattr(encoders, encoder_name)()
633633
result6 = enc6.fit_transform(x6, y)
634634
self.assertTrue((result1.values == result6.values).all())
635635

636636
# gray encoder actually does re-order inputs
637-
if encoder_name != "GrayEncoder":
637+
# rankhot encoder respects order, in this example the order is switched
638+
if encoder_name not in ["GrayEncoder", "RankHotEncoder"]:
638639
enc7 = getattr(encoders, encoder_name)()
639640
result7 = enc7.fit_transform(x7, y)
640641
self.assertTrue((result1.values == result7.values).all())

0 commit comments

Comments
 (0)