
Commit d2c8fb7

hongzmsft authored and xadupre committed
feat: add support for lightgbm.Booster (#329)
Users who need to train a LightGBM model on datasets too large to fit in memory often train through the Booster API, which supports two_round loading of huge datasets, or through the command line, which exports a model.txt file that can only be loaded back as a Booster. Currently the ONNX converter supports neither case. Most of the converter's work consists of processing information from the dumped model dictionary, and the remaining pieces can be inferred from it as well. This change wraps the Booster to expose that information and facilitate the conversion process. Multiclass models are not supported yet: the exported ONNX model has an issue with its ZipMap node.
1 parent 6ac6a11 commit d2c8fb7
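
As a quick illustration of the new capability (not part of the commit; the data, input name, and shape below are made up for this sketch), a Booster trained directly through lightgbm.train could now be converted roughly like this:

import lightgbm
import numpy
from onnxmltools.convert import convert_lightgbm
from onnxmltools.convert.common.data_types import FloatTensorType

# Train a Booster without the scikit-learn wrapper (illustrative data only).
X = numpy.random.rand(100, 2).astype(numpy.float32)
y = (X[:, 0] > X[:, 1]).astype(int)
booster = lightgbm.train({'objective': 'binary'}, lightgbm.Dataset(X, label=y))

# initial_types is mandatory; the 'input' name and [1, 2] shape are assumptions.
onnx_model = convert_lightgbm(booster, 'lightgbm-booster',
                              initial_types=[('input', FloatTensorType([1, 2]))])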

6 files changed: +124 −27 lines

onnxmltools/convert/lightgbm/_parse.py

Lines changed: 38 additions & 5 deletions
@@ -3,9 +3,11 @@
 # Licensed under the MIT License. See License.txt in the project root for
 # license information.
 # --------------------------------------------------------------------------
+import numpy

 from ..common._container import LightGbmModelContainer
 from ..common._topology import *
+from ..common.data_types import FloatTensorType

 from lightgbm import LGBMClassifier, LGBMRegressor

@@ -16,14 +18,44 @@
 lightgbm_operator_name_map = {LGBMClassifier: 'LgbmClassifier',
                               LGBMRegressor: 'LgbmRegressor'}

-
-def _get_lightgbm_operator_name(model_type):
+class WrappedBooster:
+
+    def __init__(self, booster):
+        self.booster_ = booster
+        _model_dict = self.booster_.dump_model()
+        self.classes_ = self._generate_classes(_model_dict)
+        self.n_features_ = len(_model_dict['feature_names'])
+        if _model_dict['objective'].startswith('binary'):
+            self.operator_name = 'LgbmClassifier'
+        elif _model_dict['objective'].startswith('regression'):
+            self.operator_name = 'LgbmRegressor'
+        else:
+            # Multiclass classifiers are not supported at the moment. The exported ONNX model
+            # has an issue in its ZipMap node.
+            raise ValueError('unsupported LightGbm objective: {}'.format(_model_dict['objective']))
+        if _model_dict.get('average_output', False):
+            self.boosting_type = 'rf'
+        else:
+            # Other than random forest, the boosting type does not affect later conversion,
+            # so `gbdt` is used as an arbitrary default.
+            self.boosting_type = 'gbdt'
+
+    def _generate_classes(self, model_dict):
+        if model_dict['num_class'] == 1:
+            return numpy.asarray([0, 1])
+        return numpy.arange(model_dict['num_class'])
+
+
+def _get_lightgbm_operator_name(model):
     '''
     Get operator name of the input argument
 
-    :param model_type: A lightgbm object.
+    :param model: A lightgbm object.
     :return: A string which stands for the type of the input model in our conversion framework
     '''
+    if isinstance(model, WrappedBooster):
+        return model.operator_name
+    model_type = type(model)
     if model_type not in lightgbm_operator_name_map:
         raise ValueError("No proper operator name found for '%s'" % model_type)
     return lightgbm_operator_name_map[model_type]
@@ -38,10 +70,11 @@ def _parse_lightgbm_simple_model(scope, model, inputs):
     :param inputs: A list of variables
     :return: A list of output variables which will be passed to next stage
     '''
-    this_operator = scope.declare_local_operator(_get_lightgbm_operator_name(type(model)), model)
+    operator_name = _get_lightgbm_operator_name(model)
+    this_operator = scope.declare_local_operator(operator_name, model)
    this_operator.inputs = inputs

-    if type(model) in lightgbm_classifier_list:
+    if operator_name == 'LgbmClassifier':
         # For classifiers, we may have two outputs, one for label and the other one for probabilities of all classes.
         # Notice that their types here are not necessarily correct and they will be fixed in shape inference phase
         label_variable = scope.declare_local_variable('label', FloatTensorType())
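
For orientation, the sketch below (not from the commit) shows the kind of dump_model() fields WrappedBooster relies on; real dumps contain many more keys, and the exact objective string ('binary sigmoid:1' here) is an assumption that depends on the training parameters:

# Assumed shape of booster.dump_model() output, trimmed to the keys used by WrappedBooster.
example_dump = {
    'objective': 'binary sigmoid:1',            # matched with startswith('binary')
    'feature_names': ['Column_0', 'Column_1'],  # len() gives n_features_
    'num_class': 1,                             # 1 means classes_ = numpy.asarray([0, 1])
    'average_output': False,                    # True would select boosting_type 'rf'
    'tree_info': [],                            # per-tree structures, consumed by the operator converter
}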

onnxmltools/convert/lightgbm/convert.py

Lines changed: 12 additions & 5 deletions
@@ -5,6 +5,10 @@
 # --------------------------------------------------------------------------

 from uuid import uuid4
+
+import lightgbm
+
+from onnxmltools.convert.lightgbm._parse import WrappedBooster
 from ...proto import onnx, get_opset_number_from_onnx
 from ..common._topology import convert_topology
 from ._parse import parse_lightgbm
@@ -21,10 +25,11 @@ def convert(model, name=None, initial_types=None, doc_string='', target_opset=No
     This function produces an equivalent ONNX model of the given lightgbm model.
     The supported lightgbm modules are listed below.

-    * `LGBMClassifiers <http://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier>`_
-    * `LGBMRegressor <http://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMRegressor>`_
+    * `LGBMClassifiers <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>`_
+    * `LGBMRegressor <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html>`_
+    * `Booster <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html>`_

-    :param model: A lightgbm model
+    :param model: A LightGBM model
     :param initial_types: a python list. Each element is a tuple of a variable name and a type defined in data_types.py
     :param name: The name of the graph (type: GraphProto) in the produced ONNX model (type: ModelProto)
     :param doc_string: A string attached onto the produced ONNX model
@@ -36,8 +41,10 @@ def convert(model, name=None, initial_types=None, doc_string='', target_opset=No
     :return: An ONNX model (type: ModelProto) which is equivalent to the input lightgbm model
     '''
     if initial_types is None:
-        raise ValueError('Initial types are required. See usage of convert(...) in \
-                         onnxmltools.convert.lightgbm.convert for details')
+        raise ValueError('Initial types are required. See usage of convert(...) in '
+                         'onnxmltools.convert.lightgbm.convert for details')
+    if isinstance(model, lightgbm.Booster):
+        model = WrappedBooster(model)
     if name is None:
         name = str(uuid4().hex)
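
The same entry point covers the command-line workflow mentioned in the commit message: a model.txt produced by the LightGBM CLI can only be loaded back as a Booster, which convert() now wraps automatically. A minimal sketch, assuming a binary model saved as model.txt with two features (both assumptions):

import lightgbm
from onnxmltools.convert import convert_lightgbm
from onnxmltools.convert.common.data_types import FloatTensorType

# Reload a CLI-trained model and convert it; the file name and input shape are illustrative.
booster = lightgbm.Booster(model_file='model.txt')
onnx_model = convert_lightgbm(booster, 'cli-trained-model',
                              initial_types=[('input', FloatTensorType([1, 2]))])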

onnxmltools/convert/lightgbm/operator_converters/LightGbm.py

Lines changed: 21 additions & 14 deletions
@@ -70,8 +70,10 @@ def _parse_tree_structure(tree_id, class_id, learning_rate, tree_structure, attr
     else:
         attrs['nodes_missing_value_tracks_true'].append(0)
     attrs['nodes_hitrates'].append(1.)
-    _parse_node(tree_id, class_id, left_id, node_id_pool, learning_rate, tree_structure['left_child'], attrs)
-    _parse_node(tree_id, class_id, right_id, node_id_pool, learning_rate, tree_structure['right_child'], attrs)
+    _parse_node(tree_id, class_id, left_id, node_id_pool, learning_rate,
+                tree_structure['left_child'], attrs)
+    _parse_node(tree_id, class_id, right_id, node_id_pool, learning_rate,
+                tree_structure['right_child'], attrs)


 def _parse_node(tree_id, class_id, node_id, node_id_pool, learning_rate, node, attrs):
@@ -97,8 +99,10 @@ def _parse_node(tree_id, class_id, node_id, node_id_pool, learning_rate, node, a
         attrs['nodes_hitrates'].append(1.)

         # Recursively dive into the child nodes
-        _parse_node(tree_id, class_id, left_id, node_id_pool, learning_rate, node['left_child'], attrs)
-        _parse_node(tree_id, class_id, right_id, node_id_pool, learning_rate, node['right_child'], attrs)
+        _parse_node(tree_id, class_id, left_id, node_id_pool, learning_rate, node['left_child'],
+                    attrs)
+        _parse_node(tree_id, class_id, right_id, node_id_pool, learning_rate, node['right_child'],
+                    attrs)
     elif hasattr(node, 'left_child') or hasattr(node, 'right_child'):
         raise ValueError('Need two branches')
     else:
@@ -130,19 +134,20 @@ def convert_lightgbm(scope, operator, container):

     attrs = get_default_tree_classifier_attribute_pairs()
     attrs['name'] = operator.full_name
-
+
     # Create different attributes for classifier and regressor, respectively
-    if isinstance(gbm_model, LGBMClassifier):
+    if gbm_text['objective'].startswith('binary'):
+        n_classes = 1
+        attrs['post_transform'] = 'LOGISTIC'
+    elif gbm_text['objective'].startswith('multiclass'):
         n_classes = gbm_text['num_class']
-        if gbm_model.objective_ == 'multiclass':
-            attrs['post_transform'] = 'SOFTMAX'
-        else:
-            attrs['post_transform'] = 'LOGISTIC'
-    else:
+        attrs['post_transform'] = 'SOFTMAX'
+    elif gbm_text['objective'].startswith('regression'):
         n_classes = 1 # Regressor has only one output variable
         attrs['post_transform'] = 'NONE'
         attrs['n_targets'] = n_classes
-
+    else:
+        assert False, 'LightGBM objective should be cleaned already'
     # Use the same algorithm to parse the tree
     for i, tree in enumerate(gbm_text['tree_info']):
         tree_id = i
@@ -156,7 +161,8 @@ def convert_lightgbm(scope, operator, container):
     tree_number = len(node_numbers_per_tree.keys())
     accumulated_node_numbers = [0] * tree_number
     for i in range(1, tree_number):
-        accumulated_node_numbers[i] = accumulated_node_numbers[i - 1] + node_numbers_per_tree[i - 1]
+        accumulated_node_numbers[i] = (accumulated_node_numbers[i - 1]
+                                       + node_numbers_per_tree[i - 1])
     global_node_indexes = []
     for i in range(len(attrs['nodes_nodeids'])):
         tree_id = attrs['nodes_treeids'][i]
@@ -169,7 +175,8 @@ def convert_lightgbm(scope, operator, container):
         attrs[k] = sorted_list

     # Create ONNX object
-    if isinstance(gbm_model, LGBMClassifier):
+    if (gbm_text['objective'].startswith('binary')
+            or gbm_text['objective'].startswith('multiclass')):
         # Prepare label information for both of TreeEnsembleClassifier and ZipMap
         class_type = onnx_proto.TensorProto.STRING
         zipmap_attrs = {'name': scope.get_unique_variable_name('ZipMap')}
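
The converter now branches on the objective string from dump_model() instead of the scikit-learn class, because a raw Booster is neither an LGBMClassifier nor an LGBMRegressor. The post_transform values it picks correspond to the usual link functions applied to the summed tree outputs; the numbers below are a rough sketch of that behaviour, not onnxruntime's actual implementation:

import numpy

# 'LOGISTIC' (binary objective): sigmoid of the summed leaf values for one sample.
raw_score = numpy.array([0.7])
logistic = 1.0 / (1.0 + numpy.exp(-raw_score))
# 'SOFTMAX' (multiclass objective): normalised exponentials over per-class scores.
raw_multi = numpy.array([0.2, 1.1, -0.3])
softmax = numpy.exp(raw_multi) / numpy.exp(raw_multi).sum()
# 'NONE' (regression objective): the summed value is returned unchanged.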

onnxmltools/utils/tests_helper.py

Lines changed: 20 additions & 1 deletion
@@ -75,7 +75,19 @@ def dump_data_and_model(data, model, onnx=None, basename="model", folder=None,
         os.makedirs(folder)

     if hasattr(model, "predict"):
-        if hasattr(model, "predict_proba"):
+        import lightgbm
+        if isinstance(model, lightgbm.Booster):
+            # LightGBM Booster
+            model_dict = model.dump_model()
+            if model_dict['objective'].startswith('binary'):
+                score = model.predict(data)
+                prediction = [score > 0.5, numpy.vstack([1-score, score]).T]
+            elif model_dict['objective'].startswith('multiclass'):
+                score = model.predict(data)
+                prediction = [score.argmax(axis=1), score]
+            else:
+                prediction = [model.predict(data)]
+        elif hasattr(model, "predict_proba"):
             # Classifier
             prediction = [model.predict(data), model.predict_proba(data)]
         elif hasattr(model, "decision_function"):
@@ -172,6 +184,13 @@ def convert_model(model, name, input_types):
     elif model.__class__.__name__.startswith("XGB"):
         from onnxmltools.convert import convert_xgboost
         model, prefix = convert_xgboost(model, name, input_types), "XGB"
+    elif model.__class__.__name__ == 'Booster':
+        import lightgbm
+        if isinstance(model, lightgbm.Booster):
+            from onnxmltools.convert import convert_lightgbm
+            model, prefix = convert_lightgbm(model, name, input_types), "LightGbm"
+        else:
+            raise RuntimeError("Unable to convert model of type '{0}'.".format(type(model)))
     elif isinstance(model, BaseEstimator):
         from onnxmltools.convert import convert_sklearn
         model, prefix = convert_sklearn(model, name, input_types), "Sklearn"
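
The Booster branch above exists because lightgbm.Booster.predict returns probabilities rather than labels: a 1-D array of positive-class probabilities for binary objectives and an (n_samples, num_class) array for multiclass. A small self-contained sketch of the binary-case reshaping, using made-up scores:

import numpy

# Pretend Booster.predict already returned positive-class probabilities for 4 samples.
score = numpy.array([0.1, 0.8, 0.3, 0.9])
labels = score > 0.5                                # thresholded labels
probabilities = numpy.vstack([1 - score, score]).T  # shape (n_samples, 2)
prediction = [labels, probabilities]                # same layout as for sklearn classifiers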

onnxmltools/utils/utils_backend_onnxruntime.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@
 import os
 import glob
 import pickle
+import warnings
+
 import numpy
 from numpy.testing import assert_array_almost_equal, assert_array_equal
 from .utils_backend import load_data_and_model, extract_options, ExpectedAssertionError, OnnxRuntimeAssertionError, compare_outputs

tests/lightgbm/test_LightGbmTreeEnsembleConverters.py

Lines changed: 31 additions & 2 deletions
@@ -5,13 +5,15 @@
 # --------------------------------------------------------------------------

 import unittest
+
+import lightgbm
 import numpy
 from lightgbm import LGBMClassifier, LGBMRegressor
-from onnxmltools import convert_lightgbm
 from onnxmltools.convert.common.data_types import FloatTensorType
 from onnxmltools.utils import dump_data_and_model
 from onnxmltools.utils import dump_binary_classification, dump_multiple_classification
-from onnxmltools.utils import dump_multiple_regression, dump_single_regression
+from onnxmltools.utils import dump_single_regression
+from onnxmltools.utils.tests_helper import convert_model


 class TestLightGbmTreeEnsembleModels(unittest.TestCase):
@@ -33,6 +35,33 @@ def test_lightgbm_regressor2(self):
         model = LGBMRegressor(n_estimators=2, max_depth=1, min_child_samples=1)
         dump_single_regression(model, suffix="2")

+    def test_lightgbm_booster_classifier(self):
+        X = [[0, 1], [1, 1], [2, 0], [1, 2]]
+        X = numpy.array(X, dtype=numpy.float32)
+        y = [0, 1, 0, 1]
+        data = lightgbm.Dataset(X, label=y)
+        model = lightgbm.train({'boosting_type': 'gbdt', 'objective': 'binary',
+                                'n_estimators': 3, 'min_child_samples': 1},
+                               data)
+        model_onnx, prefix = convert_model(model, 'tree-based multi-output classifier',
+                                           [('input', FloatTensorType([1, 2]))])
+        dump_data_and_model(X, model, model_onnx,
+                            allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
+                            basename=prefix + "BoosterBin" + model.__class__.__name__)
+
+    def test_lightgbm_booster_regressor(self):
+        X = [[0, 1], [1, 1], [2, 0]]
+        X = numpy.array(X, dtype=numpy.float32)
+        y = [0, 1, 1.1]
+        data = lightgbm.Dataset(X, label=y)
+        model = lightgbm.train({'boosting_type': 'gbdt', 'objective': 'regression',
+                                'n_estimators': 3, 'min_child_samples': 1, 'max_depth': 1},
+                               data)
+        model_onnx, prefix = convert_model(model, 'tree-based binary classifier',
+                                           [('input', FloatTensorType([1, 2]))])
+        dump_data_and_model(X, model, model_onnx,
+                            basename=prefix + "BoosterBin" + model.__class__.__name__)
+

 if __name__ == "__main__":
     unittest.main()
