Skip to content

Commit e673b7c

Browse files
added gray encoder
1 parent bcb9d9e commit e673b7c

File tree

7 files changed

+210
-1
lines changed

7 files changed

+210
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
unreleased
22
==========
3+
* added gray encoder
34

45
v2.5.1.post0
56
============

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ __Unsupervised:__
2020
* Backward Difference Contrast [2][3]
2121
* BaseN [6]
2222
* Binary [5]
23+
* Gray [14]
2324
* Count [10]
2425
* Hashing [1]
2526
* Helmert Contrast [2][3]
@@ -147,4 +148,4 @@ References
147148
11. Transforming categorical features to numerical features. From https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
148149
12. Andrew Gelman and Jennifer Hill (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models. From https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf
149150
13. Carlos Mougan, David Masip, Jordi Nin and Oriol Pujol (2021). Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems. https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14
150-
151+
14. Gray Encoding. From https://en.wikipedia.org/wiki/Gray_code

category_encoders/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from category_encoders.backward_difference import BackwardDifferenceEncoder
1010
from category_encoders.binary import BinaryEncoder
11+
from category_encoders.gray import GrayEncoder
1112
from category_encoders.count import CountEncoder
1213
from category_encoders.hashing import HashingEncoder
1314
from category_encoders.helmert import HelmertEncoder
@@ -32,6 +33,7 @@
3233
__all__ = [
3334
"BackwardDifferenceEncoder",
3435
"BinaryEncoder",
36+
"GrayEncoder",
3537
"CountEncoder",
3638
"HashingEncoder",
3739
"HelmertEncoder",

category_encoders/gray.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""Gray encoding"""
2+
from functools import partialmethod
3+
4+
import pandas as pd
5+
6+
from category_encoders import utils
7+
from category_encoders.basen import BaseNEncoder
8+
from typing import List
9+
10+
__author__ = 'paulwestenthanner'
11+
12+
13+
class GrayEncoder(BaseNEncoder):
14+
"""Gray encoding for categorical variables.
15+
Gray encoding is a form of binary encoding where consecutive values only differ by a single bit.
16+
Hence, gray encoding only makes sense for ordinal features.
17+
This has benefits in privacy preserving data publishing.
18+
19+
Parameters
20+
----------
21+
22+
verbose: int
23+
integer indicating verbosity of the output. 0 for none.
24+
cols: list
25+
a list of columns to encode, if None, all string columns will be encoded.
26+
drop_invariant: bool
27+
boolean for whether or not to drop columns with 0 variance.
28+
return_df: bool
29+
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
30+
handle_unknown: str
31+
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
32+
an extra column will be added in if the transform matrix has unknown categories. This can cause
33+
unexpected changes in dimension in some cases.
34+
handle_missing: str
35+
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
36+
an extra column will be added in if the transform matrix has nan values. This can cause
37+
unexpected changes in dimension in some cases.
38+
39+
Example
40+
-------
41+
>>> from category_encoders import GrayEncoder
42+
>>> import pandas as pd
43+
>>> from sklearn.datasets import load_boston
44+
>>> bunch = load_boston()
45+
>>> y = bunch.target
46+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
47+
>>> enc = GrayEncoder(cols=['CHAS', 'RAD']).fit(X, y)
48+
>>> numeric_dataset = enc.transform(X)
49+
>>> print(numeric_dataset.info())
50+
<class 'pandas.core.frame.DataFrame'>
51+
RangeIndex: 506 entries, 0 to 505
52+
Data columns (total 18 columns):
53+
CRIM 506 non-null float64
54+
ZN 506 non-null float64
55+
INDUS 506 non-null float64
56+
CHAS_0 506 non-null int64
57+
CHAS_1 506 non-null int64
58+
NOX 506 non-null float64
59+
RM 506 non-null float64
60+
AGE 506 non-null float64
61+
DIS 506 non-null float64
62+
RAD_0 506 non-null int64
63+
RAD_1 506 non-null int64
64+
RAD_2 506 non-null int64
65+
RAD_3 506 non-null int64
66+
RAD_4 506 non-null int64
67+
TAX 506 non-null float64
68+
PTRATIO 506 non-null float64
69+
B 506 non-null float64
70+
LSTAT 506 non-null float64
71+
dtypes: float64(11), int64(7)
72+
memory usage: 71.3 KB
73+
None
74+
75+
References
76+
----------
77+
78+
.. [1] https://en.wikipedia.org/wiki/Gray_code
79+
.. [2] Jun Zhang, Graham Cormode, Cecilia M. Procopiuc, Divesh Srivastava, and Xiaokui Xiao. 2017. PrivBayes:
80+
Private Data Release via Bayesian Networks. ACM Trans. Database Syst. 42, 4, Article 25 (October 2017)
81+
"""
82+
encoding_relation = utils.EncodingRelation.ONE_TO_M
83+
__init__ = partialmethod(BaseNEncoder.__init__, base=2)
84+
85+
@staticmethod
86+
def gray_code(n, n_bit) -> List[int]:
87+
gray = n ^ (n >> 1)
88+
gray_formatted = "{0:0{1}b}".format(gray, n_bit)
89+
return [int(bit) for bit in gray_formatted]
90+
91+
def _fit(self, X, y=None, **kwargs):
92+
super(GrayEncoder, self)._fit(X, y, **kwargs)
93+
gray_mapping = []
94+
# convert binary mapping to Gray mapping and reorder
95+
for col_to_encode in self.mapping:
96+
col = col_to_encode["col"]
97+
bin_mapping = col_to_encode["mapping"]
98+
n_cols_out = bin_mapping.shape[1]
99+
map_null = bin_mapping[bin_mapping.index < 0]
100+
map_non_null = bin_mapping[bin_mapping.index >= 0].copy()
101+
ordinal_mapping = [m for m in self.ordinal_encoder.mapping if m.get("col") == col]
102+
if len(ordinal_mapping) != 1:
103+
raise ValueError("Cannot find ordinal encoder mapping of Gray encoder")
104+
ordinal_mapping = ordinal_mapping[0]["mapping"]
105+
reverse_ordinal_mapping = {v: k for k, v in ordinal_mapping.to_dict().items()}
106+
map_non_null["orig_value"] = map_non_null.index.to_series().map(reverse_ordinal_mapping)
107+
map_non_null = map_non_null.sort_values(by="orig_value")
108+
gray_encoding = [self.gray_code(i + 1, n_cols_out) for i in range(map_non_null.shape[0])]
109+
gray_encoding = pd.DataFrame(data=gray_encoding, index=map_non_null.index, columns=bin_mapping.columns)
110+
gray_encoding = pd.concat([gray_encoding, map_null])
111+
gray_mapping.append({"col": col, "mapping": gray_encoding})
112+
self.mapping = gray_mapping

docs/source/gray.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Gray
2+
====
3+
4+
.. autoclass:: category_encoders.gray.GrayEncoder
5+
:members:
6+
:inherited-members:

docs/source/index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ To use:
4444
encoder = ce.CatBoostEncoder(cols=[...])
4545
encoder = ce.CountEncoder(cols=[...])
4646
encoder = ce.GLMMEncoder(cols=[...])
47+
encoder = ce.GrayEncoder(cols=[...])
4748
encoder = ce.HashingEncoder(cols=[...])
4849
encoder = ce.HelmertEncoder(cols=[...])
4950
encoder = ce.JamesSteinEncoder(cols=[...])
@@ -74,6 +75,7 @@ Contents:
7475
catboost
7576
count
7677
glmm
78+
gray
7779
hashing
7880
helmert
7981
jamesstein

tests/test_gray.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import pandas as pd
2+
from unittest import TestCase
3+
import numpy as np
4+
5+
import category_encoders as encoders
6+
7+
8+
class TestGrayEncoder(TestCase):
9+
10+
def test_gray_sorting(self):
11+
data = np.array(['ba', 'ba', 'aa'])
12+
out = encoders.GrayEncoder().fit_transform(data)
13+
expected = pd.DataFrame([[1, 1], [1, 1], [0, 1]], columns=['0_0', '0_1'])
14+
pd.testing.assert_frame_equal(out, expected)
15+
16+
def test_gray_mapping(self):
17+
train_data = pd.DataFrame()
18+
train_data["cat_col"] = np.array([4, 9, 6, 7, 7, 9])
19+
train_data["other_col"] = range(train_data.shape[0])
20+
encoder = encoders.GrayEncoder(cols=["cat_col"])
21+
encoder.fit(train_data)
22+
23+
expected_ordinal_mapping = {4.0: 1, 9.0: 2, 6.0: 3, 7.0: 4, "nan": -2}
24+
expected_mapping = pd.DataFrame(
25+
[
26+
[0, 0, 1],
27+
[0, 1, 1],
28+
[0, 1, 0],
29+
[1, 1, 0],
30+
[0, 0, 0],
31+
[0, 0, 0],
32+
], columns=[f"cat_col_{i}" for i in range(3)], index=[1, 3, 4, 2, -1, -2]
33+
)
34+
self.assertEqual(len(encoder.mapping), 1)
35+
self.assertEqual(len(encoder.mapping[0].keys()), 2)
36+
37+
actual_ordinal_encoding = encoder.ordinal_encoder.mapping[0]["mapping"]
38+
actual_ordinal_encoding.index = actual_ordinal_encoding.index.fillna("nan")
39+
self.assertDictEqual(actual_ordinal_encoding.to_dict(), expected_ordinal_mapping)
40+
pd.testing.assert_frame_equal(encoder.mapping[0]["mapping"], expected_mapping)
41+
42+
train_transformed = encoder.transform(train_data)
43+
train_data["cat_col"] = np.array([4, 9, 6, 7, 7, 9])
44+
expected_train_transformed = [
45+
[0, 0, 1, 0],
46+
[1, 1, 0, 1],
47+
[0, 1, 1, 2],
48+
[0, 1, 0, 3],
49+
[0, 1, 0, 4],
50+
[1, 1, 0, 5],
51+
]
52+
expected_train_transformed = pd.DataFrame(expected_train_transformed,
53+
columns=[f"cat_col_{i}" for i in range(3)] + ["other_col"],
54+
index=train_data.index)
55+
pd.testing.assert_frame_equal(train_transformed, expected_train_transformed)
56+
test_data = pd.DataFrame()
57+
test_data["cat_col"] = np.array([4, 3, None, np.nan])
58+
test_data["other_col"] = range(test_data.shape[0])
59+
expected_test_transformed = [
60+
[0, 0, 1, 0],
61+
[0, 0, 0, 1],
62+
[0, 0, 0, 2],
63+
[0, 0, 0, 3],
64+
]
65+
expected_test_transformed = pd.DataFrame(expected_test_transformed,
66+
columns=[f"cat_col_{i}" for i in range(3)] + ["other_col"],
67+
index=test_data.index)
68+
test_transformed = encoder.transform(test_data)
69+
pd.testing.assert_frame_equal(test_transformed, expected_test_transformed)
70+
71+
def test_gray_code(self):
72+
input_expected_output = {
73+
(0, 0): [0],
74+
(0, 1): [0],
75+
(0, 3): [0, 0, 0],
76+
(1, 1): [1],
77+
(1, 3): [0, 0, 1],
78+
(2, 2): [1, 1],
79+
(13, 4): [1, 0, 1, 1],
80+
(13, 6): [0, 0, 1, 0, 1, 1],
81+
}
82+
for test_input, expected_output in input_expected_output.items():
83+
n, n_bits = test_input
84+
out = encoders.GrayEncoder.gray_code(n, n_bits)
85+
self.assertEqual(out, expected_output)

0 commit comments

Comments
 (0)