Commit c10d361

Fixed OHE error with Scikit 0.19.0 (#191)
1 parent 005f426 commit c10d361
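
This commit lets the OneHotEncoder converter handle both scikit-learn generations: 0.19 models expose `n_values`, `categorical_features`, `feature_indices_` and `active_features_`, while 0.20+ models expose `categories_`, and the new `hasattr(op, 'categories_')` check selects the matching branch. Below is a minimal sketch of how such a model might be converted, assuming onnxmltools' `convert_sklearn` entry point and `Int64TensorType` from `onnxmltools.convert.common.data_types`; the tensor name and shape are illustrative only.

# Sketch only: converting a fitted OneHotEncoder with onnxmltools.
# Assumes onnxmltools.convert_sklearn and Int64TensorType are available;
# the encoder's fitted attributes differ between scikit-learn 0.19 and 0.20+.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

import onnxmltools
from onnxmltools.convert.common.data_types import Int64TensorType

X = np.array([[0, 1], [1, 2], [2, 0]], dtype=np.int64)
enc = OneHotEncoder().fit(X)  # exposes categories_ on 0.20+, feature_indices_/active_features_ on 0.19

# hasattr(enc, 'categories_') is the check this commit adds to pick the right code path
onnx_model = onnxmltools.convert_sklearn(
    enc, 'one-hot encoder', initial_types=[('input', Int64TensorType([1, X.shape[1]]))])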

2 files changed: +88 -23 lines changed


onnxmltools/convert/sklearn/operator_converters/OneHotEncoder.py

Lines changed: 41 additions & 11 deletions
@@ -14,21 +14,51 @@
 def convert_sklearn_one_hot_encoder(scope, operator, container):
     op = operator.raw_operator
     C = operator.inputs[0].type.shape[1]
-    categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
 
     # encoded_slot_sizes[i] is the number of output coordinates associated with the ith categorical feature
     categorical_values_per_feature = []
-
-    categorical_values_per_feature = []
-    for cat in op.categories_:
-        if cat is None and len(cat) == 0:
-            continue
-        if cat.dtype in (numpy.float32, numpy.float64, numpy.int32, numpy.int64):
-            categorical_values_per_feature.append(list(cat.astype(numpy.int64)))
-        elif cat.dtype in (numpy.str, numpy.unicode, numpy.object):
-            categorical_values_per_feature.append([str(_) for _ in cat])
+    if hasattr(op, 'categories_'):
+        categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
+
+        for cat in op.categories_:
+            if cat is None and len(cat) == 0:
+                continue
+            if cat.dtype in (numpy.float32, numpy.float64, numpy.int32, numpy.int64):
+                categorical_values_per_feature.append(list(cat.astype(numpy.int64)))
+            elif cat.dtype in (numpy.str, numpy.unicode, numpy.object):
+                categorical_values_per_feature.append([str(_) for _ in cat])
+            else:
+                raise TypeError("Categories must be int or strings not {0}.".format(cat.dtype))
+    else:
+        if op.categorical_features == 'all':
+            categorical_feature_indices = [i for i in range(C)]
+        elif isinstance(op.categorical_features, collections.Iterable):
+            if all(isinstance(i, bool) for i in op.categorical_features):
+                categorical_feature_indices = [i for i, active in enumerate(op.categorical_features) if active]
+            else:
+                categorical_feature_indices = [int(i) for i in op.categorical_features]
+        else:
+            raise ValueError('Unknown operation mode')
+
+        if op.n_values == 'auto':
+            # Use active feature to determine output length
+            for i in range(len(op.feature_indices_) - 1):
+                allowed_values = []
+                index_head = op.feature_indices_[i]
+                index_tail = op.feature_indices_[i + 1]  # feature indexed by index_tail not included in this category
+                for j in op.active_features_:
+                    if index_head <= j and j < index_tail:
+                        allowed_values.append(j - index_head)
+                categorical_values_per_feature.append(allowed_values)
+        elif isinstance(op.n_values, numbers.Integral):
+            # Each categorical feature will be mapped to a fixed length one-hot sub-vector
+            for i in range(len(op.feature_indices_) - 1):
+                index_head = op.feature_indices_[i]
+                categorical_values_per_feature.append(list(i - index_head for i in range(op.n_values)))
         else:
-            raise TypeError("Categories must be int or strings not {0}.".format(cat.dtype))
+            # Each categorical feature has its own sub-vector length
+            for max_index in op.n_values:
+                categorical_values_per_feature.append(list(i for i in range(max_index)))
 
     # Variable names produced by one-hot encoders. Each of them is the encoding result of a categorical feature.
     final_variable_names = []
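
The `else:` branch added above rebuilds the per-feature category values from the fitted pre-0.20 attributes `feature_indices_` and `active_features_`. Below is a standalone sketch of the `n_values='auto'` case with hypothetical attribute values (not taken from a real fitted model).

# Standalone sketch of the n_values='auto' logic added above (hypothetical values).
# feature_indices_[i]:feature_indices_[i + 1] is the one-hot slice owned by feature i;
# active_features_ lists the one-hot columns actually observed during fit.
feature_indices_ = [0, 3, 7]        # two categorical features, widths 3 and 4
active_features_ = [0, 2, 3, 5, 6]  # value 1 of feature 0 and value 1 of feature 1 never occurred

categorical_values_per_feature = []
for i in range(len(feature_indices_) - 1):
    index_head, index_tail = feature_indices_[i], feature_indices_[i + 1]
    allowed = [j - index_head for j in active_features_ if index_head <= j < index_tail]
    categorical_values_per_feature.append(allowed)

print(categorical_values_per_feature)  # [[0, 2], [0, 2, 3]]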

onnxmltools/convert/sklearn/shape_calculators/OneHotEncoder.py

Lines changed: 47 additions & 12 deletions
@@ -34,21 +34,56 @@ def calculate_sklearn_one_hot_encoder_output_shapes(operator):
     '''
     op = operator.raw_operator
 
-    categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
-
-    # Calculate the number of allowed categorical values in each original categorical coordinate.
     # encoded_slot_sizes[i] is the number of output coordinates associated with the ith categorical feature.
     encoded_slot_sizes = []
+    if hasattr(op, 'categories_'):
+        categorical_feature_indices = [i for i, mat in enumerate(op.categories_) if mat is not None and len(mat) > 0]
+
+        # Calculate the number of allowed categorical values in each original categorical coordinate.
+        # Use active feature to determine output length
+        index_head = 0
+        for i in range(len(op.categories_)):
+            if op.categories_[i] is None or len(op.categories_[i]) == 0:
+                continue
+            categorical_size = op.categories_[i].shape[0]
+            # feature indexed by index_tail is not included in this category
+            index_tail = index_head + categorical_size
+            encoded_slot_sizes.append(categorical_size)
+    else:
+        if op.categorical_features == 'all':
+            # In this case, all features need to be encoded
+            C = operator.inputs[0].type.shape[1]
+            categorical_feature_indices = [i for i in range(C)]
+        elif isinstance(op.categorical_features, collections.Iterable):
+            # In this case, there are two formats to specify which features are encoded.
+            if all(isinstance(i, (bool, np.bool_)) for i in op.categorical_features):
+                # op.categorical_features is a binary vector. Its ith element is 0/1 if the ith coordinate is not encoded/
+                # encoded.
+                categorical_feature_indices = [i for i, active in enumerate(op.categorical_features) if active]
+            else:
+                # op.categorical_features is a vector containing all categorical features' indexes.
+                categorical_feature_indices = [int(i) for i in op.categorical_features]
+        else:
+            raise ValueError('Unknown operation mode')
 
-    # Use active feature to determine output length
-    index_head = 0
-    for i in range(len(op.categories_)):
-        if op.categories_[i] is None or len(op.categories_[i]) == 0:
-            continue
-        categorical_size = op.categories_[i].shape[0]
-        # feature indexed by index_tail is not included in this category
-        index_tail = index_head + categorical_size
-        encoded_slot_sizes.append(categorical_size)
+        # Calculate the number of allowed categorical values in each original categorical coordinate.
+        if op.n_values == 'auto':
+            # Use active feature to determine output length
+            for i in range(len(op.feature_indices_) - 1):
+                categorical_size = 0
+                index_head = op.feature_indices_[i]
+                index_tail = op.feature_indices_[i + 1]  # feature indexed by index_tail is not included in this category
+                for j in op.active_features_:
+                    if index_head <= j and j < index_tail:
+                        categorical_size += 1
+                encoded_slot_sizes.append(categorical_size)
+        elif isinstance(op.n_values, numbers.Integral):
+            # Each categorical feature will be mapped to a fixed length one-hot sub-vector
+            for i in range(len(op.feature_indices_) - 1):
+                encoded_slot_sizes.append(op.n_values)
+        else:
+            # Each categorical feature has its own sub-vector length
+            encoded_slot_sizes = [i for i in op.n_values]
 
     N = operator.inputs[0].type.shape[0]
     # Calculate the output feature length by replacing the count of categorical
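
In the shape calculator, `encoded_slot_sizes` feeds the output-width computation hinted at by the trailing context line: each encoded feature is replaced by its one-hot block while the remaining features keep one column each. A rough sketch of that arithmetic with hypothetical numbers (it mirrors the idea, not the file's exact code):

# Sketch of the output-width arithmetic (hypothetical numbers, not the library code).
C = 5                                  # original number of input features
categorical_feature_indices = [1, 3]   # two features are one-hot encoded
encoded_slot_sizes = [4, 3]            # their one-hot widths

# every categorical feature is replaced by its one-hot block, the rest pass through
output_width = (C - len(categorical_feature_indices)) + sum(encoded_slot_sizes)
print(output_width)  # 10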
