Skip to content

Commit 0f839e0

Browse files
Merge branch 'master' into jona/fix_pandas_future_warning_for_dropping_invariants
2 parents ce16b23 + 531a271 commit 0f839e0

File tree

8 files changed

+54
-45
lines changed

.github/workflows/docs.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ jobs:
88
runs-on: ubuntu-latest
99
steps:
1010
- name: Clone
11-
uses: actions/checkout@v1
11+
uses: actions/checkout@v2
1212
- name: Dependencies
1313
run: |
1414
python -m pip install --upgrade pip wheel

.github/workflows/pypi-publish.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,9 @@ jobs:
88
runs-on: ubuntu-latest
99
steps:
1010
- name: Clone
11-
uses: actions/checkout@v1
11+
uses: actions/checkout@v2
1212
- name: Set up Python 3.7
13-
uses: actions/setup-python@v1
13+
uses: actions/setup-python@v2
1414
with:
1515
python-version: 3.7
1616
- name: Build package
@@ -24,4 +24,4 @@ jobs:
2424
uses: pypa/gh-action-pypi-publish@master
2525
with:
2626
user: willmcg4132
27-
password: ${{ secrets.pypi_password }}
27+
password: ${{ secrets.pypi_password }}

.github/workflows/test-docs-build.yml

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,14 @@ on:
55
jobs:
66
docs:
77
runs-on: ubuntu-latest
8+
strategy:
9+
matrix:
10+
python-version: ['3.10']
811
steps:
9-
- uses: actions/checkout@v1
12+
- uses: actions/checkout@v2
13+
- uses: actions/setup-python@v2
14+
with:
15+
python-version: ${{ matrix.python-version }}
1016
- uses: ammaraskar/sphinx-action@master
1117
with:
12-
docs-folder: "docs/"
18+
docs-folder: "docs/"

.github/workflows/test-suite.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,12 +15,12 @@ jobs:
1515
runs-on: ubuntu-latest
1616
strategy:
1717
matrix:
18-
python-version: [3.5, 3.6, 3.7, 3.8]
18+
python-version: ['3.5', '3.6', '3.7', '3.8', '3.9', '3.10']
1919

2020
steps:
2121
- uses: actions/checkout@v2
2222
- name: Set up Python ${{ matrix.python-version }}
23-
uses: actions/setup-python@v1
23+
uses: actions/setup-python@v2
2424
with:
2525
python-version: ${{ matrix.python-version }}
2626
- name: Install dependencies
@@ -30,4 +30,4 @@ jobs:
3030
python -m pip install -r requirements-dev.txt
3131
- name: Test with pytest
3232
run: |
33-
pytest
33+
pytest

category_encoders/count.py

Lines changed: 24 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,17 @@
44
import numpy as np
55
import pandas as pd
66
import category_encoders.utils as util
7+
from category_encoders.ordinal import OrdinalEncoder
78

89
from copy import copy
910
from sklearn.base import BaseEstimator, TransformerMixin
1011

1112

1213
__author__ = 'joshua t. dunn'
1314

14-
# COUNT_ENCODER BRANCH
15+
1516
class CountEncoder(BaseEstimator, TransformerMixin):
17+
1618
def __init__(self, verbose=0, cols=None, drop_invariant=False,
1719
return_df=True, handle_unknown='value',
1820
handle_missing='value',
@@ -118,6 +120,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
118120
self.min_group_name = min_group_name
119121
self.combine_min_nan_groups = combine_min_nan_groups
120122
self.feature_names = None
123+
self.ordinal_encoder = None
121124

122125
self._check_set_create_attrs()
123126

@@ -157,9 +160,17 @@ def fit(self, X, y=None, **kwargs):
157160
else:
158161
self.cols = util.convert_cols_to_list(self.cols)
159162

163+
self.ordinal_encoder = OrdinalEncoder(
164+
verbose=self.verbose,
165+
cols=self.cols,
166+
handle_unknown='value',
167+
handle_missing='value'
168+
)
169+
self.ordinal_encoder = self.ordinal_encoder.fit(X)
170+
X_ordinal = self.ordinal_encoder.transform(X)
160171
self._check_set_create_dict_attrs()
161172

162-
self._fit_count_encode(X, y)
173+
self._fit_count_encode(X_ordinal, y)
163174

164175
X_temp = self.transform(X, override_return_df=True)
165176
self.feature_names = list(X_temp.columns)
@@ -235,28 +246,11 @@ def _fit_count_encode(self, X_in, y):
235246
self.mapping = {}
236247

237248
for col in self.cols:
238-
if X[col].isnull().any():
239-
if self._handle_missing[col] == 'error':
240-
raise ValueError(
241-
'Missing data found in column %s at fit time.'
242-
% (col,)
243-
)
244-
245-
elif self._handle_missing[col] not in ['value', 'return_nan', 'error', None]:
246-
raise ValueError(
247-
'%s key in `handle_missing` should be one of: '
248-
' `value`, `return_nan` and `error` not `%s`.'
249-
% (col, str(self._handle_missing[col]))
250-
)
251-
252-
self.mapping[col] = X[col].value_counts(
253-
normalize=self._normalize[col],
254-
dropna=False
255-
)
256-
257-
self.mapping[col].index = self.mapping[col].index.astype(object)
258-
259-
249+
mapping_values = X[col].value_counts(normalize=self._normalize[col])
250+
ordinal_encoding = [m["mapping"] for m in self.ordinal_encoder.mapping if m["col"] == col][0]
251+
reversed_ordinal_enc = {v: k for k, v in ordinal_encoding.to_dict().items()}
252+
mapping_values.index = mapping_values.index.map(reversed_ordinal_enc)
253+
self.mapping[col] = mapping_values
260254

261255
if self._handle_missing[col] == 'return_nan':
262256
self.mapping[col][np.NaN] = np.NaN
@@ -272,15 +266,15 @@ def _transform_count_encode(self, X_in, y):
272266
X = X_in.copy(deep=True)
273267

274268
for col in self.cols:
275-
276-
X[col] = X.fillna(value=np.nan)[col]
269+
# Treat None as np.nan
270+
X[col] = pd.Series([el if el is not None else np.NaN for el in X[col]], index=X[col].index)
271+
if self.handle_missing == "value":
272+
if not util.is_category(X[col].dtype):
273+
X[col] = X[col].fillna(np.nan)
277274

278275
if self._min_group_size is not None:
279276
if col in self._min_group_categories.keys():
280-
X[col] = (
281-
X[col].map(self._min_group_categories[col])
282-
.fillna(X[col])
283-
)
277+
X[col] = X[col].map(self._min_group_categories[col]).fillna(X[col])
284278

285279
X[col] = X[col].astype(object).map(self.mapping[col])
286280
if isinstance(self._handle_unknown[col], (int, np.integer)):

category_encoders/ordinal.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -289,15 +289,14 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand
289289
for switch in mapping:
290290
column = switch.get('col')
291291
col_mapping = switch['mapping']
292-
X[column] = X[column].map(col_mapping)
293292

293+
# Treat None as np.nan
294+
X[column] = pd.Series([el if el is not None else np.NaN for el in X[column]], index=X[column].index)
295+
X[column] = X[column].map(col_mapping)
294296
if util.is_category(X[column].dtype):
295-
if not isinstance(col_mapping, pd.Series):
296-
col_mapping = pd.Series(col_mapping)
297297
nan_identity = col_mapping.loc[col_mapping.index.isna()].values[0]
298298
X[column] = X[column].cat.add_categories(nan_identity)
299299
X[column] = X[column].fillna(nan_identity)
300-
301300
try:
302301
X[column] = X[column].astype(int)
303302
except ValueError as e:

tests/test_count.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ def test_count_handle_missing_string(self):
6262

6363
def test_count_handle_missing_dict(self):
6464
"""Test the handle_missing dict on 'none' and 'na_categorical'.
65-
We want to see differing behavour between 'none' and 'na_cat' cols."""
65+
We want to see differing behaviour between 'none' and 'na_cat' cols."""
6666
enc = encoders.CountEncoder(
6767
handle_missing={'na_categorical': 'return_nan'}
6868
)
@@ -169,7 +169,7 @@ def test_count_combine_min_nan_groups_bool(self):
169169
self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
170170
self.assertEqual(out['na_categorical'].unique().shape[0], 3)
171171
self.assertTrue(enc.mapping is not None)
172-
self.assertIn(np.nan, enc.mapping['na_categorical'])
172+
self.assertIn(np.NaN, enc.mapping['na_categorical'])
173173

174174
def test_count_combine_min_nan_groups_dict(self):
175175
"""Test the combine_min_nan_groups dict on 'none' and 'na_categorical'."""

tests/test_ordinal.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,16 @@ def test_HaveNoneAndNan_ExpectCodesAsOne(self):
161161

162162
self.assertEqual(expected, result)
163163

164+
new_nan = pd.DataFrame({'city': [np.nan,]})
165+
result_new_nan = enc.transform(new_nan)['city'].tolist()
166+
expected_new_nan = [1]
167+
self.assertEqual(expected_new_nan, result_new_nan)
168+
169+
new_none = pd.DataFrame({'city': [None, ]})
170+
result_new_none = enc.transform(new_none)['city'].tolist()
171+
expected_new_none = [1]
172+
self.assertEqual(expected_new_none, result_new_none)
173+
164174
def test_inverse_transform_HaveUnknown_ExpectWarning(self):
165175
train = pd.DataFrame({'city': ['chicago', 'st louis']})
166176
test = pd.DataFrame({'city': ['chicago', 'los angeles']})

0 commit comments

Comments
 (0)