Commit 060a71e

Author: Prabhat

Fixed XGBoost classifier converter output labels (#336)

* Fixed XGBoost classifier converter output labels
* Removed commented-out code
* Fixed typo
* Added unit test with discrete int labels
* Fixed XGBoost converters' unit tests

1 parent 303273b · commit 060a71e
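In short: before this commit the classifier converter always emitted the class labels 0..ncl-1, so models trained on string labels or on non-contiguous integer labels came back with incorrect output labels. The fix reads xgb_node.classes_ and chooses the ONNX label attribute from its dtype. A minimal sketch of that dispatch, mirroring the logic added in XGBoost.py below (pick_class_labels is a hypothetical helper for illustration; the real code writes into attr_pairs inside the converter):

import numpy as np

def pick_class_labels(classes):
    # Numeric classes become int64 labels; anything else becomes UTF-8 string labels.
    if (np.issubdtype(classes.dtype, np.floating)
            or np.issubdtype(classes.dtype, np.signedinteger)):
        return {'classlabels_int64s': classes.astype('int')}
    return {'classlabels_strings': np.array([s.encode('utf-8') for s in classes])}

pick_class_labels(np.array([10, 20, -30]))  # -> {'classlabels_int64s': array([10, 20, -30])}
pick_class_labels(np.array(['a', 'b']))     # -> {'classlabels_strings': array([b'a', b'b'])}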

File tree: 3 files changed (+156 / -58 lines)


onnxmltools/convert/xgboost/operator_converters/XGBoost.py

Lines changed: 10 additions & 10 deletions
@@ -4,15 +4,10 @@
 # license information.
 # --------------------------------------------------------------------------
 
-import ctypes
-import numbers
-import numpy
 import json
-from xgboost import XGBRegressor, XGBClassifier
-from xgboost.core import _LIB, _check_call, from_cstr_to_pystr
-from ...common.tree_ensemble import get_default_tree_classifier_attribute_pairs
+import numpy as np
+from xgboost import XGBClassifier
 from ...common._registration import register_converter
-from ...common import utils
 from ..common import get_xgb_params
 
 
@@ -29,7 +24,7 @@ def get_xgb_params(xgb_node):
     def validate(xgb_node):
         params = XGBConverter.get_xgb_params(xgb_node)
         try:
-            if not "objective" in params:
+            if "objective" not in params:
                 raise AttributeError('ojective')
         except AttributeError as e:
             raise RuntimeError('Missing attribute in XGBoost model ' + str(e))
@@ -238,7 +233,13 @@ def convert(scope, operator, container):
         attr_pairs['class_ids'] = [v % ncl for v in attr_pairs['class_treeids']]
         class_labels = list(range(ncl))
 
-        attr_pairs['classlabels_int64s'] = class_labels
+        classes = xgb_node.classes_
+        if (np.issubdtype(classes.dtype, np.floating) or
+                np.issubdtype(classes.dtype, np.signedinteger)):
+            attr_pairs['classlabels_int64s'] = classes.astype('int')
+        else:
+            classes = np.array([s.encode('utf-8') for s in classes])
+            attr_pairs['classlabels_strings'] = classes
 
         # add nodes
         if objective == "binary:logistic":
@@ -262,7 +263,6 @@ def convert(scope, operator, container):
             raise RuntimeError("Unexpected objective: {0}".format(objective))
 
 
-
 def convert_xgboost(scope, operator, container):
     xgb_node = operator.raw_operator
     if isinstance(xgb_node, XGBClassifier):

onnxmltools/convert/xgboost/shape_calculators/Classifier.py

Lines changed: 11 additions & 2 deletions
@@ -4,9 +4,13 @@
 # license information.
 # --------------------------------------------------------------------------
 
+import numpy as np
 from ...common._registration import register_shape_calculator
 from ...common.utils import check_input_and_output_numbers, check_input_and_output_types
-from ...common.data_types import Int64TensorType, FloatTensorType, DictionaryType, SequenceType
+from ...common.data_types import (
+    DictionaryType, FloatTensorType, Int64TensorType,
+    SequenceType, StringTensorType,
+)
 from ..common import get_xgb_params
 
 
@@ -28,7 +32,12 @@ def calculate_xgboost_classifier_output_shapes(operator):
     ncl = ntrees // params['n_estimators']
     if objective == "reg:logistic" and ncl == 1:
         ncl = 2
-    operator.outputs[0].type = Int64TensorType(shape=[N])
+    classes = xgb_node.classes_
+    if (np.issubdtype(classes.dtype, np.floating) or
+            np.issubdtype(classes.dtype, np.signedinteger)):
+        operator.outputs[0].type = Int64TensorType(shape=[N])
+    else:
+        operator.outputs[0].type = StringTensorType(shape=[N])
     operator.outputs[1].type = operator.outputs[1].type = FloatTensorType([N, ncl])
 
 
tests/xgboost/test_xgboost_converters.py

Lines changed: 135 additions & 46 deletions
@@ -3,77 +3,166 @@
 """
 import sys
 import unittest
-from sklearn.datasets import load_iris
+import numpy as np
+from sklearn.datasets import load_diabetes, load_iris, make_classification
+from sklearn.model_selection import train_test_split
 from xgboost import XGBRegressor, XGBClassifier
 from onnxmltools.convert import convert_xgboost
 from onnxmltools.convert.common.data_types import FloatTensorType
-from onnxmltools.utils import dump_multiple_classification, dump_single_regression, dump_binary_classification
+from onnxmltools.utils import dump_data_and_model
+
+
+def _fit_classification_model(model, n_classes, is_str=False):
+    x, y = make_classification(n_classes=n_classes, n_features=100,
+                               n_samples=1000,
+                               random_state=42, n_informative=7)
+    y = y.astype(np.str) if is_str else y.astype(np.int64)
+    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.5,
+                                                   random_state=42)
+    model.fit(x_train, y_train)
+    return model, x_test.astype(np.float32)
 
 
 class TestXGBoostModels(unittest.TestCase):
 
-    @unittest.skipIf(sys.version_info[0] == 2, reason="xgboost converter not tested on python 2")
+    @unittest.skipIf(sys.version_info[0] == 2,
+                     reason="xgboost converter not tested on python 2")
     def test_xgb_regressor(self):
-        iris = load_iris()
-        X = iris.data[:, :2]
+        iris = load_diabetes()
+        x = iris.data
         y = iris.target
-
+        x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.5,
+                                                       random_state=42)
         xgb = XGBRegressor()
-        xgb.fit(X, y)
-        conv_model = convert_xgboost(xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
+        xgb.fit(x_train, y_train)
+        conv_model = convert_xgboost(
+            xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
         self.assertTrue(conv_model is not None)
-        dump_single_regression(xgb, suffix="-Dec4")
+        dump_data_and_model(
+            x_test.astype("float32"),
+            xgb,
+            conv_model,
+            basename="SklearnXGBRegressor-Dec4",
+            allow_failure="StrictVersion("
+                          "onnx.__version__)"
+                          "< StrictVersion('1.3.0')",
+        )
 
-    @unittest.skipIf(sys.version_info[0] == 2, reason="xgboost converter not tested on python 2")
+    @unittest.skipIf(sys.version_info[0] == 2,
+                     reason="xgboost converter not tested on python 2")
     def test_xgb_classifier(self):
-        iris = load_iris()
-        X = iris.data[:, :2]
-        y = iris.target
-        y[y == 2] = 0
-
-        xgb = XGBClassifier()
-        xgb.fit(X, y)
-        conv_model = convert_xgboost(xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
+        xgb, x_test = _fit_classification_model(XGBClassifier(), 2)
+        conv_model = convert_xgboost(
+            xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
         self.assertTrue(conv_model is not None)
-        dump_binary_classification(xgb)
+        dump_data_and_model(
+            x_test,
+            xgb,
+            conv_model,
+            basename="SklearnXGBClassifier",
+            allow_failure="StrictVersion("
+                          "onnx.__version__)"
+                          "< StrictVersion('1.3.0')",
+        )
 
-    @unittest.skipIf(sys.version_info[0] == 2, reason="xgboost converter not tested on python 2")
+    @unittest.skipIf(sys.version_info[0] == 2,
+                     reason="xgboost converter not tested on python 2")
     def test_xgb_classifier_multi(self):
-        iris = load_iris()
-        X = iris.data[:, :2]
-        y = iris.target
-
-        xgb = XGBClassifier()
-        xgb.fit(X, y)
-        conv_model = convert_xgboost(xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
+        xgb, x_test = _fit_classification_model(XGBClassifier(), 3)
+        conv_model = convert_xgboost(
+            xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
         self.assertTrue(conv_model is not None)
-        dump_multiple_classification(xgb, allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')")
+        dump_data_and_model(
+            x_test,
+            xgb,
+            conv_model,
+            basename="SklearnXGBClassifierMulti",
+            allow_failure="StrictVersion("
+                          "onnx.__version__)"
+                          "< StrictVersion('1.3.0')",
+        )
 
-    @unittest.skipIf(sys.version_info[0] == 2, reason="xgboost converter not tested on python 2")
+    @unittest.skipIf(sys.version_info[0] == 2,
+                     reason="xgboost converter not tested on python 2")
     def test_xgb_classifier_multi_reglog(self):
-        iris = load_iris()
-        X = iris.data[:, :2]
-        y = iris.target
-
-        xgb = XGBClassifier(objective='reg:logistic')
-        xgb.fit(X, y)
-        conv_model = convert_xgboost(xgb, initial_types=[('input', FloatTensorType(shape=[1, 2]))])
+        xgb, x_test = _fit_classification_model(
+            XGBClassifier(objective='reg:logistic'), 4)
+        conv_model = convert_xgboost(
+            xgb, initial_types=[('input', FloatTensorType(shape=[1, 2]))])
         self.assertTrue(conv_model is not None)
-        dump_multiple_classification(xgb, suffix="RegLog",
-                                     allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')")
+        dump_data_and_model(
+            x_test,
+            xgb,
+            conv_model,
+            basename="SklearnXGBClassifierMultiRegLog",
+            allow_failure="StrictVersion("
+                          "onnx.__version__)"
+                          "< StrictVersion('1.3.0')",
+        )
 
-    @unittest.skipIf(sys.version_info[0] == 2, reason="xgboost converter not tested on python 2")
+    @unittest.skipIf(sys.version_info[0] == 2,
+                     reason="xgboost converter not tested on python 2")
     def test_xgb_classifier_reglog(self):
+        xgb, x_test = _fit_classification_model(
+            XGBClassifier(objective='reg:logistic'), 2)
+        conv_model = convert_xgboost(
+            xgb, initial_types=[('input', FloatTensorType(shape=[1, 2]))])
+        self.assertTrue(conv_model is not None)
+        dump_data_and_model(
+            x_test,
+            xgb,
+            conv_model,
+            basename="SklearnXGBClassifierRegLog",
+            allow_failure="StrictVersion("
+                          "onnx.__version__)"
+                          "< StrictVersion('1.3.0')",
+        )
+
+    @unittest.skipIf(sys.version_info[0] == 2,
+                     reason="xgboost converter not tested on python 2")
+    def test_xgb_classifier_multi_str_labels(self):
+        xgb, x_test = _fit_classification_model(
+            XGBClassifier(n_estimators=4), 5, is_str=True)
+        conv_model = convert_xgboost(
+            xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
+        self.assertTrue(conv_model is not None)
+        dump_data_and_model(
+            x_test,
+            xgb,
+            conv_model,
+            basename="SklearnXGBClassifierMultiStrLabels",
+            allow_failure="StrictVersion("
+                          "onnx.__version__)"
+                          "< StrictVersion('1.3.0')",
+        )
+
+    @unittest.skipIf(sys.version_info[0] == 2,
+                     reason="xgboost converter not tested on python 2")
+    def test_xgb_classifier_multi_discrete_int_labels(self):
         iris = load_iris()
-        X = iris.data[:, :2]
+        x = iris.data[:, :2]
         y = iris.target
-        y[y == 2] = 0
-
-        xgb = XGBClassifier(objective='reg:logistic')
-        xgb.fit(X, y)
-        conv_model = convert_xgboost(xgb, initial_types=[('input', FloatTensorType(shape=[1, 2]))])
+        y[y == 0] = 10
+        y[y == 1] = 20
+        y[y == 2] = -30
+        x_train, x_test, y_train, _ = train_test_split(x,
+                                                       y,
+                                                       test_size=0.5,
+                                                       random_state=42)
+        xgb = XGBClassifier(n_estimators=3)
+        xgb.fit(x_train, y_train)
+        conv_model = convert_xgboost(
+            xgb, initial_types=[('input', FloatTensorType(shape=[1, 'None']))])
         self.assertTrue(conv_model is not None)
-        dump_binary_classification(xgb, suffix="RegLog")
+        dump_data_and_model(
+            x_test.astype("float32"),
+            xgb,
+            conv_model,
+            basename="SklearnXGBClassifierMultiDiscreteIntLabels",
+            allow_failure="StrictVersion("
+                          "onnx.__version__)"
+                          "< StrictVersion('1.3.0')",
+        )
 
 
 if __name__ == "__main__":
