Commit c10d361

Fixed OHE error with Scikit 0.19.0 (#191)
1 parent 005f426 commit c10d361
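
This commit lets the OneHotEncoder converter handle both scikit-learn generations: 0.19 models expose `n_values`, `categorical_features`, `feature_indices_` and `active_features_`, while 0.20+ models expose `categories_`, and the new `hasattr(op, 'categories_')` check selects the matching branch. Below is a minimal sketch of how such a model might be converted, assuming onnxmltools' `convert_sklearn` entry point and `Int64TensorType` from `onnxmltools.convert.common.data_types`; the tensor name and shape are illustrative only.

# Sketch only: converting a fitted OneHotEncoder with onnxmltools.
# Assumes onnxmltools.convert_sklearn and Int64TensorType are available;
# the encoder's fitted attributes differ between scikit-learn 0.19 and 0.20+.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

import onnxmltools
from onnxmltools.convert.common.data_types import Int64TensorType

X = np.array([[0, 1], [1, 2], [2, 0]], dtype=np.int64)
enc = OneHotEncoder().fit(X)  # exposes categories_ on 0.20+, feature_indices_/active_features_ on 0.19

# hasattr(enc, 'categories_') is the check this commit adds to pick the right code path
onnx_model = onnxmltools.convert_sklearn(
    enc, 'one-hot encoder', initial_types=[('input', Int64TensorType([1, X.shape[1]]))])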

2 files changed: +88 -23 lines changed


onnxmltools/convert/sklearn/operator_converters/OneHotEncoder.py

Lines changed: 41 additions & 11 deletions
@@ -14,21 +14,51 @@
 def convert_sklearn_one_hot_encoder(scope, operator, container):
     op = operator.raw_operator
     C = operator.inputs[0].type.shape[1]
-    categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
 
     # encoded_slot_sizes[i] is the number of output coordinates associated with the ith categorical feature
     categorical_values_per_feature = []
-
-    categorical_values_per_feature = []
-    for cat in op.categories_:
-        if cat is None and len(cat) == 0:
-            continue
-        if cat.dtype in (numpy.float32, numpy.float64, numpy.int32, numpy.int64):
-            categorical_values_per_feature.append(list(cat.astype(numpy.int64)))
-        elif cat.dtype in (numpy.str, numpy.unicode, numpy.object):
-            categorical_values_per_feature.append([str(_) for _ in cat])
+    if hasattr(op, 'categories_'):
+        categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
+
+        for cat in op.categories_:
+            if cat is None and len(cat) == 0:
+                continue
+            if cat.dtype in (numpy.float32, numpy.float64, numpy.int32, numpy.int64):
+                categorical_values_per_feature.append(list(cat.astype(numpy.int64)))
+            elif cat.dtype in (numpy.str, numpy.unicode, numpy.object):
+                categorical_values_per_feature.append([str(_) for _ in cat])
+            else:
+                raise TypeError("Categories must be int or strings not {0}.".format(cat.dtype))
+    else:
+        if op.categorical_features == 'all':
+            categorical_feature_indices = [i for i in range(C)]
+        elif isinstance(op.categorical_features, collections.Iterable):
+            if all(isinstance(i, bool) for i in op.categorical_features):
+                categorical_feature_indices = [i for i, active in enumerate(op.categorical_features) if active]
+            else:
+                categorical_feature_indices = [int(i) for i in op.categorical_features]
+        else:
+            raise ValueError('Unknown operation mode')
+
+        if op.n_values == 'auto':
+            # Use active feature to determine output length
+            for i in range(len(op.feature_indices_) - 1):
+                allowed_values = []
+                index_head = op.feature_indices_[i]
+                index_tail = op.feature_indices_[i + 1]  # feature indexed by index_tail not included in this category
+                for j in op.active_features_:
+                    if index_head <= j and j < index_tail:
+                        allowed_values.append(j - index_head)
+                categorical_values_per_feature.append(allowed_values)
+        elif isinstance(op.n_values, numbers.Integral):
+            # Each categorical feature will be mapped to a fixed length one-hot sub-vector
+            for i in range(len(op.feature_indices_) - 1):
+                index_head = op.feature_indices_[i]
+                categorical_values_per_feature.append(list(i - index_head for i in range(op.n_values)))
         else:
-            raise TypeError("Categories must be int or strings not {0}.".format(cat.dtype))
+            # Each categorical feature has its own sub-vector length
+            for max_index in op.n_values:
+                categorical_values_per_feature.append(list(i for i in range(max_index)))
 
     # Variable names produced by one-hot encoders. Each of them is the encoding result of a categorical feature.
     final_variable_names = []
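
The `else:` branch added above rebuilds the per-feature category values from the fitted pre-0.20 attributes `feature_indices_` and `active_features_`. Below is a standalone sketch of the `n_values='auto'` case with hypothetical attribute values (not taken from a real fitted model).

# Standalone sketch of the n_values='auto' logic added above (hypothetical values).
# feature_indices_[i]:feature_indices_[i + 1] is the one-hot slice owned by feature i;
# active_features_ lists the one-hot columns actually observed during fit.
feature_indices_ = [0, 3, 7]        # two categorical features, widths 3 and 4
active_features_ = [0, 2, 3, 5, 6]  # value 1 of feature 0 and value 1 of feature 1 never occurred

categorical_values_per_feature = []
for i in range(len(feature_indices_) - 1):
    index_head, index_tail = feature_indices_[i], feature_indices_[i + 1]
    allowed = [j - index_head for j in active_features_ if index_head <= j < index_tail]
    categorical_values_per_feature.append(allowed)

print(categorical_values_per_feature)  # [[0, 2], [0, 2, 3]]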

onnxmltools/convert/sklearn/shape_calculators/OneHotEncoder.py

Lines changed: 47 additions & 12 deletions
@@ -34,21 +34,56 @@ def calculate_sklearn_one_hot_encoder_output_shapes(operator):
     '''
     op = operator.raw_operator
 
-    categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
-
-    # Calculate the number of allowed categorical values in each original categorical coordinate.
     # encoded_slot_sizes[i] is the number of output coordinates associated with the ith categorical feature.
     encoded_slot_sizes = []
+    if hasattr(op, 'categories_'):
+        categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
+
+        # Calculate the number of allowed categorical values in each original categorical coordinate.
+        # Use active feature to determine output length
+        index_head = 0
+        for i in range(len(op.categories_)):
+            if op.categories_[i] is None or len(op.categories_[i]) == 0:
+                continue
+            categorical_size = op.categories_[i].shape[0]
+            # feature indexed by index_tail is not included in this category
+            index_tail = index_head + categorical_size
+            encoded_slot_sizes.append(categorical_size)
+    else:
+        if op.categorical_features == 'all':
+            # In this case, all features need to be encoded
+            C = operator.inputs[0].type.shape[1]
+            categorical_feature_indices = [i for i in range(C)]
+        elif isinstance(op.categorical_features, collections.Iterable):
+            # In this case, there are two formats to specify which features are encoded.
+            if all(isinstance(i, (bool, np.bool_)) for i in op.categorical_features):
+                # op.categorical_features is a binary vector. Its ith element is 0/1 if the ith coordinate is not encoded/
+                # encoded.
+                categorical_feature_indices = [i for i, active in enumerate(op.categorical_features) if active]
+            else:
+                # op.categorical_features is a vector containing all categorical features' indexes.
+                categorical_feature_indices = [int(i) for i in op.categorical_features]
+        else:
+            raise ValueError('Unknown operation mode')
 
-    # Use active feature to determine output length
-    index_head = 0
-    for i in range(len(op.categories_)):
-        if op.categories_[i] is None or len(op.categories_[i]) == 0:
-            continue
-        categorical_size = op.categories_[i].shape[0]
-        # feature indexed by index_tail is not included in this category
-        index_tail = index_head + categorical_size
-        encoded_slot_sizes.append(categorical_size)
+        # Calculate the number of allowed categorical values in each original categorical coordinate.
+        if op.n_values == 'auto':
+            # Use active feature to determine output length
+            for i in range(len(op.feature_indices_) - 1):
+                categorical_size = 0
+                index_head = op.feature_indices_[i]
+                index_tail = op.feature_indices_[i + 1]  # feature indexed by index_tail is not included in this category
+                for j in op.active_features_:
+                    if index_head <= j and j < index_tail:
+                        categorical_size += 1
+                encoded_slot_sizes.append(categorical_size)
+        elif isinstance(op.n_values, numbers.Integral):
+            # Each categorical feature will be mapped to a fixed length one-hot sub-vector
+            for i in range(len(op.feature_indices_) - 1):
+                encoded_slot_sizes.append(op.n_values)
+        else:
+            # Each categorical feature has its own sub-vector length
+            encoded_slot_sizes = [i for i in op.n_values]
 
     N = operator.inputs[0].type.shape[0]
     # Calculate the output feature length by replacing the count of categorical
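
In the shape calculator, `encoded_slot_sizes` feeds the output-width computation hinted at by the trailing context line: each encoded feature is replaced by its one-hot block while the remaining features keep one column each. A rough sketch of that arithmetic with hypothetical numbers (it mirrors the idea, not the file's exact code):

# Sketch of the output-width arithmetic (hypothetical numbers, not the library code).
C = 5                                  # original number of input features
categorical_feature_indices = [1, 3]   # two features are one-hot encoded
encoded_slot_sizes = [4, 3]            # their one-hot widths

# every categorical feature is replaced by its one-hot block, the rest pass through
output_width = (C - len(categorical_feature_indices)) + sum(encoded_slot_sizes)
print(output_width)  # 10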
