Skip to content

Commit 65c07eb

Browse files
authored
Limit the number of leaves in each subroutine for gradient boosted trees (#123)
Fixes #103
1 parent 5ae584d commit 65c07eb

File tree

5 files changed

+269
-11
lines changed

5 files changed

+269
-11
lines changed

m2cgen/assemblers/boosting.py

Lines changed: 81 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@ class BaseBoostingAssembler(ModelAssembler):
99

1010
classifier_name = None
1111

12-
def __init__(self, model, trees, base_score=0, tree_limit=None):
12+
def __init__(self, model, trees, base_score=0, tree_limit=None,
13+
leaves_cutoff_threshold=3000):
1314
super().__init__(model)
1415
self.all_trees = trees
1516
self._base_score = base_score
17+
self._leaves_cutoff_threshold = leaves_cutoff_threshold
1618

1719
self._output_size = 1
1820
self._is_classification = False
@@ -41,10 +43,19 @@ def _assemble_single_output(self, trees, base_score=0):
4143
trees = trees[:self._tree_limit]
4244

4345
trees_ast = [self._assemble_tree(t) for t in trees]
46+
to_sum = trees_ast
47+
48+
# In a large tree we need to generate multiple subroutines to avoid
49+
# java limitations https://github.com/BayesWitnesses/m2cgen/issues/103.
50+
trees_num_leaves = [self._count_leaves(t) for t in trees]
51+
if sum(trees_num_leaves) > self._leaves_cutoff_threshold:
52+
to_sum = self._split_into_subroutines(trees_ast, trees_num_leaves)
53+
4454
result_ast = utils.apply_op_to_expressions(
4555
ast.BinNumOpType.ADD,
4656
ast.NumVal(base_score),
47-
*trees_ast)
57+
*to_sum)
58+
4859
return ast.SubroutineExpr(result_ast)
4960

5061
def _assemble_multi_class_output(self, trees):
@@ -74,15 +85,47 @@ def _assemble_bin_class_output(self, trees):
7485
proba_expr
7586
])
7687

88+
def _split_into_subroutines(self, trees_ast, trees_num_leaves):
89+
result = []
90+
subroutine_trees = []
91+
subroutine_sum_leaves = 0
92+
for tree, num_leaves in zip(trees_ast, trees_num_leaves):
93+
next_sum = subroutine_sum_leaves + num_leaves
94+
if subroutine_trees and next_sum > self._leaves_cutoff_threshold:
95+
# Exceeded the max leaves in the current subroutine,
96+
# finalize this one and start a new one.
97+
partial_result = utils.apply_op_to_expressions(
98+
ast.BinNumOpType.ADD,
99+
*subroutine_trees)
100+
101+
result.append(ast.SubroutineExpr(partial_result))
102+
103+
subroutine_trees = []
104+
subroutine_sum_leaves = 0
105+
106+
subroutine_sum_leaves += num_leaves
107+
subroutine_trees.append(tree)
108+
109+
if subroutine_trees:
110+
partial_result = utils.apply_op_to_expressions(
111+
ast.BinNumOpType.ADD,
112+
*subroutine_trees)
113+
result.append(ast.SubroutineExpr(partial_result))
114+
return result
115+
77116
def _assemble_tree(self, tree):
78117
raise NotImplementedError
79118

119+
@staticmethod
120+
def _count_leaves(trees):
121+
raise NotImplementedError
122+
80123

81124
class XGBoostModelAssembler(BaseBoostingAssembler):
82125

83126
classifier_name = "XGBClassifier"
84127

85-
def __init__(self, model):
128+
def __init__(self, model, leaves_cutoff_threshold=3000):
86129
feature_names = model.get_booster().feature_names
87130
self._feature_name_to_idx = {
88131
name: idx for idx, name in enumerate(feature_names or [])
@@ -96,7 +139,8 @@ def __init__(self, model):
96139
best_ntree_limit = getattr(model, "best_ntree_limit", None)
97140

98141
super().__init__(model, trees, base_score=model.base_score,
99-
tree_limit=best_ntree_limit)
142+
tree_limit=best_ntree_limit,
143+
leaves_cutoff_threshold=leaves_cutoff_threshold)
100144

101145
def _assemble_tree(self, tree):
102146
if "leaf" in tree:
@@ -130,16 +174,31 @@ def _assemble_child_tree(self, tree, child_id):
130174
return self._assemble_tree(child)
131175
assert False, "Unexpected child ID {}".format(child_id)
132176

177+
@staticmethod
178+
def _count_leaves(tree):
179+
queue = [tree]
180+
num_leaves = 0
181+
182+
while queue:
183+
tree = queue.pop()
184+
if "leaf" in tree:
185+
num_leaves += 1
186+
elif "children" in tree:
187+
for child in tree["children"]:
188+
queue.append(child)
189+
return num_leaves
190+
133191

134192
class LightGBMModelAssembler(BaseBoostingAssembler):
135193

136194
classifier_name = "LGBMClassifier"
137195

138-
def __init__(self, model):
196+
def __init__(self, model, leaves_cutoff_threshold=3000):
139197
model_dump = model.booster_.dump_model()
140198
trees = [m["tree_structure"] for m in model_dump["tree_info"]]
141199

142-
super().__init__(model, trees)
200+
super().__init__(model, trees,
201+
leaves_cutoff_threshold=leaves_cutoff_threshold)
143202

144203
def _assemble_tree(self, tree):
145204
if "leaf_value" in tree:
@@ -151,9 +210,9 @@ def _assemble_tree(self, tree):
151210
op = ast.CompOpType.from_str_op(tree["decision_type"])
152211
assert op == ast.CompOpType.LTE, "Unexpected comparison op"
153212

154-
# Make sure that if the 'default_left' is true the left tree branch
213+
# Make sure that if the "default_left" is true the left tree branch
155214
# ends up in the "else" branch of the ast.IfExpr.
156-
if tree['default_left']:
215+
if tree["default_left"]:
157216
op = ast.CompOpType.GT
158217
true_child = tree["right_child"]
159218
false_child = tree["left_child"]
@@ -166,6 +225,20 @@ def _assemble_tree(self, tree):
166225
self._assemble_tree(true_child),
167226
self._assemble_tree(false_child))
168227

228+
@staticmethod
229+
def _count_leaves(tree):
230+
queue = [tree]
231+
num_leaves = 0
232+
233+
while queue:
234+
tree = queue.pop()
235+
if "leaf_value" in tree:
236+
num_leaves += 1
237+
else:
238+
queue.append(tree["left_child"])
239+
queue.append(tree["right_child"])
240+
return num_leaves
241+
169242

170243
def _split_trees_by_classes(trees, n_classes):
171244
# Splits are computed based on a comment

tests/assemblers/test_lightgbm.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,53 @@ def test_regression():
110110
ast.BinNumOpType.ADD))
111111

112112
assert utils.cmp_exprs(actual, expected)
113+
114+
115+
def test_leaves_cutoff_threshold():
116+
estimator = lightgbm.LGBMClassifier(n_estimators=2, random_state=1,
117+
max_depth=1)
118+
utils.train_model_classification_binary(estimator)
119+
120+
assembler = assemblers.LightGBMModelAssembler(estimator,
121+
leaves_cutoff_threshold=1)
122+
actual = assembler.assemble()
123+
124+
sigmoid = ast.BinNumExpr(
125+
ast.NumVal(1),
126+
ast.BinNumExpr(
127+
ast.NumVal(1),
128+
ast.ExpExpr(
129+
ast.BinNumExpr(
130+
ast.NumVal(0),
131+
ast.SubroutineExpr(
132+
ast.BinNumExpr(
133+
ast.BinNumExpr(
134+
ast.NumVal(0),
135+
ast.SubroutineExpr(
136+
ast.IfExpr(
137+
ast.CompExpr(
138+
ast.FeatureRef(23),
139+
ast.NumVal(868.2000000000002),
140+
ast.CompOpType.GT),
141+
ast.NumVal(0.2762557140263451),
142+
ast.NumVal(0.6399134166614473))),
143+
ast.BinNumOpType.ADD),
144+
ast.SubroutineExpr(
145+
ast.IfExpr(
146+
ast.CompExpr(
147+
ast.FeatureRef(27),
148+
ast.NumVal(0.14205000000000004),
149+
ast.CompOpType.GT),
150+
ast.NumVal(-0.2139321843285849),
151+
ast.NumVal(0.1151466338793227))),
152+
ast.BinNumOpType.ADD)),
153+
ast.BinNumOpType.SUB)),
154+
ast.BinNumOpType.ADD),
155+
ast.BinNumOpType.DIV,
156+
to_reuse=True)
157+
158+
expected = ast.VectorVal([
159+
ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
160+
sigmoid])
161+
162+
assert utils.cmp_exprs(actual, expected)

tests/assemblers/test_xgboost.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,3 +268,53 @@ def test_regression_saved_without_feature_names():
268268
ast.BinNumOpType.ADD))
269269

270270
assert utils.cmp_exprs(actual, expected)
271+
272+
273+
def test_leaves_cutoff_threshold():
274+
estimator = xgboost.XGBClassifier(n_estimators=2, random_state=1,
275+
max_depth=1)
276+
utils.train_model_classification_binary(estimator)
277+
278+
assembler = assemblers.XGBoostModelAssembler(estimator,
279+
leaves_cutoff_threshold=1)
280+
actual = assembler.assemble()
281+
282+
sigmoid = ast.BinNumExpr(
283+
ast.NumVal(1),
284+
ast.BinNumExpr(
285+
ast.NumVal(1),
286+
ast.ExpExpr(
287+
ast.BinNumExpr(
288+
ast.NumVal(0),
289+
ast.SubroutineExpr(
290+
ast.BinNumExpr(
291+
ast.BinNumExpr(
292+
ast.NumVal(-0.0),
293+
ast.SubroutineExpr(
294+
ast.IfExpr(
295+
ast.CompExpr(
296+
ast.FeatureRef(20),
297+
ast.NumVal(16.7950001),
298+
ast.CompOpType.GTE),
299+
ast.NumVal(-0.17062147),
300+
ast.NumVal(0.1638484))),
301+
ast.BinNumOpType.ADD),
302+
ast.SubroutineExpr(
303+
ast.IfExpr(
304+
ast.CompExpr(
305+
ast.FeatureRef(27),
306+
ast.NumVal(0.142349988),
307+
ast.CompOpType.GTE),
308+
ast.NumVal(-0.16087772),
309+
ast.NumVal(0.149866998))),
310+
ast.BinNumOpType.ADD)),
311+
ast.BinNumOpType.SUB)),
312+
ast.BinNumOpType.ADD),
313+
ast.BinNumOpType.DIV,
314+
to_reuse=True)
315+
316+
expected = ast.VectorVal([
317+
ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
318+
sigmoid])
319+
320+
assert utils.cmp_exprs(actual, expected)

tests/e2e/test_e2e.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,30 @@ def classification_binary(model):
5151
)
5252

5353

54+
def regression_random(model):
55+
return (
56+
model,
57+
utils.train_model_regression_random_data,
58+
REGRESSION,
59+
)
60+
61+
62+
def classification_random(model):
63+
return (
64+
model,
65+
utils.train_model_classification_random_data,
66+
CLASSIFICATION,
67+
)
68+
69+
70+
def classification_binary_random(model):
71+
return (
72+
model,
73+
utils.train_model_classification_binary_random_data,
74+
CLASSIFICATION,
75+
)
76+
77+
5478
# Absolute tolerance. Used in np.isclose to compare 2 values.
5579
# We compare 6 decimal digits.
5680
ATOL = 1.e-6
@@ -63,6 +87,11 @@ def classification_binary(model):
6387
LIGHT_GBM_PARAMS = dict(n_estimators=10, random_state=RANDOM_SEED)
6488
SVC_PARAMS = dict(random_state=RANDOM_SEED, decision_function_shape="ovo")
6589

90+
XGBOOST_PARAMS_LARGE = dict(base_score=0.6, n_estimators=100, max_depth=12,
91+
random_state=RANDOM_SEED)
92+
LIGHT_GBM_PARAMS_LARGE = dict(n_estimators=100, num_leaves=100, max_depth=64,
93+
random_state=RANDOM_SEED)
94+
6695

6796
@utils.cartesian_e2e_params(
6897
# These are the languages which support all models specified in the
@@ -85,11 +114,27 @@ def classification_binary(model):
85114
classification(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),
86115
classification_binary(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),
87116
117+
# LightGBM (Large Trees)
118+
regression_random(
119+
lightgbm.LGBMRegressor(**LIGHT_GBM_PARAMS_LARGE)),
120+
classification_random(
121+
lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),
122+
classification_binary_random(
123+
lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),
124+
88125
# XGBoost
89126
regression(xgboost.XGBRegressor(**XGBOOST_PARAMS)),
90127
classification(xgboost.XGBClassifier(**XGBOOST_PARAMS)),
91128
classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS)),
92129
130+
# XGBoost (Large Trees)
131+
regression_random(
132+
xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
133+
classification_random(
134+
xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
135+
classification_binary_random(
136+
xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
137+
93138
# Linear SVM
94139
regression(svm.LinearSVR(random_state=RANDOM_SEED)),
95140
classification(svm.LinearSVC(random_state=RANDOM_SEED)),

0 commit comments

Comments (0)