Skip to content

Commit 362444a

Browse files
honzasterbaWenbing Li
authored andcommitted
Add support for H2O GBM MOJO (#358)
1 parent 37e51ab commit 362444a

File tree

17 files changed

+3393
-3
lines changed

17 files changed

+3393
-3
lines changed

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ ONNXMLTools enables you to convert models from different machine learning toolki
1515
* LightGBM
1616
* libsvm
1717
* XGBoost
18+
* H2O
1819

1920
## Install
2021
You can install latest release of ONNXMLTools from [PyPi](https://pypi.org/project/onnxmltools/):
@@ -28,14 +29,15 @@ pip install git+https://github.com/onnx/onnxmltools
2829
If you choose to install `onnxmltools` from its source code, you must set the environment variable `ONNX_ML=1` before installing the `onnx` package.
2930

3031
## Dependencies
31-
This package relies on ONNX, NumPy, and ProtoBuf. If you are converting a model from scikit-learn, Core ML, Keras, LightGBM, SparkML, XGBoost, or LibSVM, you will need an environment with the respective package installed from the list below:
32+
This package relies on ONNX, NumPy, and ProtoBuf. If you are converting a model from scikit-learn, Core ML, Keras, LightGBM, SparkML, XGBoost, H2O or LibSVM, you will need an environment with the respective package installed from the list below:
3233
1. scikit-learn
3334
2. CoreMLTools
3435
3. Keras (version 2.0.8 or higher) with the corresponding Tensorflow version
3536
4. LightGBM (scikit-learn interface)
3637
5. SparkML
3738
6. XGBoost (scikit-learn interface)
3839
7. libsvm
40+
8. H2O
3941

4042
ONNXMLTools has been tested with Python **2.7**, **3.5**, **3.6**, and **3.7**.
4143
`Note: some wrapped converters may not support python 2.x anymore.`
@@ -98,6 +100,19 @@ Please refer to the following documents:
98100
* [Conversion Framework](onnxmltools/convert/README.md)
99101
* [Spark ML to ONNX Model Conversion](onnxmltools/convert/sparkml/README.md)
100102

103+
## H2O to ONNX Conversion
104+
Below is a code snippet to convert an H2O MOJO model into an ONNX model. The only prerequisite is to have a MOJO model saved on the local file system.
105+
106+
```python
107+
import onnxmltools
108+
109+
# Convert the H2O MOJO model into ONNX
110+
onnx_model = onnxmltools.convert_h2o('/path/to/h2o/gbm_mojo.zip')
111+
112+
# Save as protobuf
113+
onnxmltools.utils.save_model(onnx_model, 'h2o_gbm.onnx')
114+
```
115+
101116
# Testing model converters
102117

103118
*onnxmltools* converts models into the ONNX format which

onnxmltools/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from .convert import convert_sparkml
2525
from .convert import convert_tensorflow
2626
from .convert import convert_xgboost
27+
from .convert import convert_h2o
2728

2829
from .utils import load_model
2930
from .utils import save_model

onnxmltools/convert/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@
1212
from .main import convert_sparkml
1313
from .main import convert_tensorflow
1414
from .main import convert_xgboost
15+
from .main import convert_h2o

onnxmltools/convert/common/_container.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ class LightGbmModelContainer(CommonSklearnModelContainer):
1717
class XGBoostModelContainer(CommonSklearnModelContainer):
1818
pass
1919

20+
class H2OModelContainer(CommonSklearnModelContainer):
    """Model container for H2O MOJO models; adds nothing to its base class."""
2022

2123
class SparkmlModelContainer(RawModelContainer):
2224

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
7+
from .convert import convert

onnxmltools/convert/h2o/_parse.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
7+
from ..common._container import H2OModelContainer
8+
from ..common._topology import Topology, FloatTensorType
9+
10+
def _parse_h2o(scope, model, inputs):
    '''
    Declare the single "H2OTreeMojo" operator for the model and wire up its variables.

    :param scope: Scope object in which the operator and its outputs are declared
    :param model: A h2o model data object; ``model["params"]["classifier"]`` is read
    :param inputs: A list of variables used as the operator's inputs
    :return: The operator's output variables ('label' and 'probabilities' for a
        classifier, a single 'variable' otherwise), to be passed to the next stage
    '''
    mojo_operator = scope.declare_local_operator("H2OTreeMojo", model)
    mojo_operator.inputs = inputs

    # A classifier exposes two outputs; any other model exposes exactly one.
    if model["params"]["classifier"]:
        output_names = ('label', 'probabilities')
    else:
        output_names = ('variable',)

    declared = [scope.declare_local_variable(output_name, FloatTensorType())
                for output_name in output_names]
    for output_variable in declared:
        mojo_operator.outputs.append(output_variable)
    return mojo_operator.outputs
29+
30+
31+
def parse_h2o(model, initial_types=None, target_opset=None,
              custom_conversion_functions=None, custom_shape_calculators=None):
    '''
    Build a Topology holding one root scope with the H2O operator declared in it.

    :param model: A h2o model data object
    :param initial_types: Iterable of (variable name, type) pairs; it is iterated
        directly, so the ``None`` default cannot actually be used
    :param target_opset: Forwarded to the Topology
    :param custom_conversion_functions: Forwarded to the Topology
    :param custom_shape_calculators: Forwarded to the Topology
    :return: The Topology, with the model's inputs and outputs registered on its container
    '''
    container = H2OModelContainer(model)
    topology = Topology(
        container,
        default_batch_size='None',
        initial_types=initial_types,
        target_opset=target_opset,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators
    )
    root_scope = topology.declare_scope('__root__')

    # Declare every input variable first, then register them on the container.
    input_variables = [root_scope.declare_local_variable(input_name, declared_type)
                       for input_name, declared_type in initial_types]
    for input_variable in input_variables:
        container.add_input(input_variable)

    for output_variable in _parse_h2o(root_scope, model, input_variables):
        container.add_output(output_variable)

    return topology

onnxmltools/convert/h2o/convert.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
7+
from uuid import uuid4
8+
import json
9+
import tempfile
10+
import h2o
11+
12+
from ...proto import onnx, get_opset_number_from_onnx
13+
from ..common._topology import convert_topology
14+
from ..common.data_types import FloatTensorType
15+
from ._parse import parse_h2o
16+
17+
# Invoke the registration of all our converters and shape calculators
18+
from . import operator_converters, shape_calculators
19+
20+
21+
def convert(model, name=None, initial_types=None, doc_string='', target_opset=None,
            targeted_onnx=onnx.__version__, custom_conversion_functions=None,
            custom_shape_calculators=None):
    '''
    This function produces an equivalent ONNX model of the given H2O MOJO model.
    Supported model types:
    - GBM, with limitations:
        - poisson, gamma, tweedie distributions not supported
        - multinomial distribution supported with 3 or more classes (use binomial otherwise)
    Other limitations:
    - models with categorical splits not supported


    :param model: H2O MOJO model loaded into memory (see below for example)
    :param name: The name of the graph (type: GraphProto) in the produced ONNX model (type: ModelProto)
    :param initial_types: a python list. Each element is a tuple of a variable name and a type defined in data_types.py
    :param doc_string: A string attached onto the produced ONNX model
    :param target_opset: number, for example, 7 for ONNX 1.2, and 8 for ONNX 1.3.
    :param targeted_onnx: A string (for example, '1.1.2' and '1.2') used to specify the targeted ONNX version of the
        produced model. If ONNXMLTools cannot find a compatible ONNX python package, an error may be thrown.
    :param custom_conversion_functions: a dictionary for specifying the user customized conversion function
    :param custom_shape_calculators: a dictionary for specifying the user customized shape calculator
    :return: An ONNX model (type: ModelProto) which is equivalent to the input H2O MOJO model
    :raises ValueError: if the MOJO's algorithm is not "gbm"

    :examples:

    >>> from onnxmltools.convert import convert_h2o
    >>> file = open("/path/to/h2o_mojo.zip", "rb")
    >>> mojo_content = file.read()
    >>> file.close()
    >>> h2o_onnx_model = convert_h2o(mojo_content)
    '''
    # Local import: only needed here, for temp-file handling and cleanup.
    import os

    if name is None:
        name = str(uuid4().hex)
    if initial_types is None:
        initial_types = [('input', FloatTensorType(shape=['None', 'None']))]

    # h2o.print_mojo is given a file path, so the in-memory MOJO bytes are
    # spilled to a temporary file. mkstemp returns an OPEN descriptor; wrap it
    # so it gets closed, and always delete the file afterwards.
    fd, model_path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "wb") as mojo_file:
            mojo_file.write(model)
        mojo_str = h2o.print_mojo(model_path, format="json")
    finally:
        try:
            os.remove(model_path)
        except OSError:
            # Best-effort cleanup; never mask an error raised above.
            pass

    mojo_model = json.loads(mojo_str)
    if mojo_model["params"]["algo"] != "gbm":
        raise ValueError("Model type not supported (algo=%s). Only GBM Mojo supported for now." % mojo_model["params"]["algo"])

    target_opset = target_opset if target_opset else get_opset_number_from_onnx()
    topology = parse_h2o(mojo_model, initial_types, target_opset, custom_conversion_functions, custom_shape_calculators)
    topology.compile()
    onnx_model = convert_topology(topology, name, doc_string, target_opset, targeted_onnx)
    return onnx_model
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
7+
# To register converters for H2O operators, import associated modules here.
8+
from . import h2o
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
7+
from ...common._registration import register_converter
8+
9+
_LINK_FUNCTION_TO_POST_TRANSFORM = {
10+
'identity': 'NONE',
11+
'logit': 'LOGISTIC',
12+
'ologit': 'LOGISTIC'
13+
}
14+
15+
16+
def _get_post_transform(params):
17+
link_function = params["link_function"]
18+
family = params["family"]
19+
if family == "multinomial":
20+
return 'SOFTMAX'
21+
elif link_function not in _LINK_FUNCTION_TO_POST_TRANSFORM.keys():
22+
raise ValueError("Link function %s not supported." % link_function)
23+
else:
24+
return _LINK_FUNCTION_TO_POST_TRANSFORM[link_function]
25+
26+
27+
def _get_default_tree_attribute_pairs(is_classifier, params):
    '''
    Create the attribute dictionary for a tree-ensemble node with empty per-node lists.

    :param is_classifier: True to build classifier attributes, False for regressor ones
    :param params: Model parameters; "nclasses" and "base_score" are read here
    :return: A dict holding 'post_transform', 'base_values' ('n_targets' too for a
        regressor) and an empty list for every per-node and per-leaf attribute
    '''
    attrs = {'post_transform': _get_post_transform(params)}
    nclasses = params["nclasses"]
    if is_classifier:
        # One base value per predicted class; two or fewer classes use a single one.
        predicted_classes = nclasses if nclasses > 2 else 1
        attrs['base_values'] = [params["base_score"]] * predicted_classes
    else:
        attrs['n_targets'] = 1
        attrs['base_values'] = [params["base_score"]]

    node_keys = ('nodes_treeids', 'nodes_nodeids',
                 'nodes_featureids', 'nodes_modes', 'nodes_values',
                 'nodes_truenodeids', 'nodes_falsenodeids', 'nodes_missing_value_tracks_true')
    # Each key gets its own fresh list.
    attrs.update((key, []) for key in node_keys)

    prefix = _node_attr_prefix(is_classifier)
    attrs.update((prefix + suffix, [])
                 for suffix in ('_treeids', '_nodeids', '_ids', '_weights'))
    return attrs
46+
47+
48+
def _add_node(
        attr_pairs, is_classifier, tree_id, node_id,
        feature_id, mode, value, true_child_id, false_child_id, weights,
        missing
):
    '''
    Append one tree node to the attribute lists in ``attr_pairs``.

    Every node contributes one entry to each ``nodes_*`` list. A node whose mode
    is 'LEAF' also contributes one entry per weight to the ``class_*`` or
    ``target_*`` lists, using the weight's position as its id.

    :param attr_pairs: Attribute dictionary whose lists are appended to in place
    :param is_classifier: Selects the "class" or "target" prefix for leaf attributes
    :param weights: Iterable of leaf weights; only iterated when mode is 'LEAF'
    '''
    def push(key, item):
        attr_pairs[key].append(item)

    push('nodes_treeids', tree_id)
    push('nodes_nodeids', node_id)
    push('nodes_featureids', feature_id)
    push('nodes_modes', mode)
    push('nodes_values', float(value))
    push('nodes_truenodeids', true_child_id)
    push('nodes_falsenodeids', false_child_id)
    push('nodes_missing_value_tracks_true', missing)

    if mode != 'LEAF':
        return

    prefix = _node_attr_prefix(is_classifier)
    for weight_index, weight in enumerate(weights):
        push(prefix + '_treeids', tree_id)
        push(prefix + '_nodeids', node_id)
        push(prefix + '_ids', weight_index)
        push(prefix + '_weights', float(weight))
68+
69+
70+
def _node_attr_prefix(is_classifier):
71+
return "class" if is_classifier else "target"
72+
73+
74+
def _fill_node_attributes(tree_id, node, attr_pairs, is_classifier):
    '''
    Recursively add ``node`` and all its descendants to ``attr_pairs``.

    A node without a 'leftChild' key is a leaf and contributes its 'predValue'
    as a single weight. Any other node is a numeric split emitted as
    'BRANCH_GTE', with the right child as the true branch and the left child
    as the false branch.

    :param tree_id: Id of the tree the node belongs to
    :param node: Node dictionary; its 'id' must already be assigned
    :param attr_pairs: Attribute dictionary that is filled in place
    :param is_classifier: Forwarded to ``_add_node``
    :raises ValueError: if a split node is categorical
    '''
    if 'leftChild' not in node:
        # leaf
        leaf_weights = [node['predValue']]
        _add_node(
            attr_pairs=attr_pairs,
            is_classifier=is_classifier,
            tree_id=tree_id,
            value=0.,
            node_id=node['id'],
            feature_id=0, mode='LEAF',
            true_child_id=0, false_child_id=0,
            weights=leaf_weights,
            missing=False
        )
        return

    if node["isCategorical"]:
        raise ValueError("categorical splits not supported, use one_hot_explicit")

    split_mode = 'BRANCH_GTE'
    threshold = node['splitValue']
    _add_node(
        attr_pairs=attr_pairs,
        is_classifier=is_classifier,
        tree_id=tree_id,
        mode=split_mode,
        value=threshold,
        node_id=node['id'],
        feature_id=node['colId'],
        true_child_id=node['rightChild']['id'],
        false_child_id=node['leftChild']['id'],
        weights=None,
        # A "leftward" node sends missing values to the left (false) branch.
        missing=(0 if node["leftward"] else 1),
    )
    for child_key in ("leftChild", "rightChild"):
        _fill_node_attributes(tree_id, node[child_key], attr_pairs, is_classifier)
109+
110+
111+
def assign_node_ids(node, next_id):
    '''
    Number the nodes of a tree in pre-order (node, left subtree, right subtree).

    Each visited node dictionary gets an "id" key, starting from ``next_id``.

    :param node: Root node dictionary, or None for an empty subtree
    :param next_id: The id to give to the first visited node
    :return: The first id that was not used
    '''
    pending = [node]
    while pending:
        current = pending.pop()
        if current is None:
            continue
        current["id"] = next_id
        next_id += 1
        # Push the right child first so the whole left subtree is numbered before it.
        pending.append(current.get("rightChild", None))
        pending.append(current.get("leftChild", None))
    return next_id
118+
119+
120+
def fill_tree_attributes(model, attr_pairs, node_attr_prefix):
    '''
    Number the nodes of every tree in the model and add them to ``attr_pairs``.

    :param model: Model dictionary; each entry of ``model["trees"]`` provides "root" and "index"
    :param attr_pairs: Attribute dictionary that is filled in place
    :param node_attr_prefix: Forwarded unchanged as the ``is_classifier`` argument
        of ``_fill_node_attributes``; despite its name, the callers in this module
        pass a boolean
    '''
    for tree in model["trees"]:
        root = tree["root"]
        # Node ids restart at 0 for every tree.
        assign_node_ids(root, 0)
        _fill_node_attributes(tree["index"], root, attr_pairs, node_attr_prefix)
124+
125+
126+
def convert_regression(scope, operator, container, params):
    '''
    Emit a single 'TreeEnsembleRegressor' node for the operator's model.

    :param scope: Scope used to obtain a unique operator name
    :param operator: Operator whose ``raw_operator`` holds the model
    :param container: Container that receives the new node
    :param params: Model parameters
    '''
    mojo = operator.raw_operator

    tree_attrs = _get_default_tree_attribute_pairs(False, params)
    fill_tree_attributes(mojo, tree_attrs, False)

    # add nodes
    container.add_node(
        'TreeEnsembleRegressor',
        operator.input_full_names,
        operator.output_full_names,
        op_domain='ai.onnx.ml',
        name=scope.get_unique_operator_name('TreeEnsembleRegressor'),
        **tree_attrs
    )
136+
137+
138+
def convert_classifier(scope, operator, container, params):
    '''
    Emit a single 'TreeEnsembleClassifier' node for the operator's model.

    :param scope: Scope used to obtain a unique operator name
    :param operator: Operator whose ``raw_operator`` holds the model
    :param container: Container that receives the new node
    :param params: Model parameters; "family", "nclasses", "n_trees_in_group"
        and "class_labels" are read here
    :raises ValueError: if the family is "multinomial" with exactly two classes
    '''
    two_class_multinomial = params["family"] == "multinomial" and params["nclasses"] == 2
    if two_class_multinomial:
        raise ValueError("Multinomial distribution with two classes not supported, use binomial distribution.")
    mojo = operator.raw_operator

    tree_attrs = _get_default_tree_attribute_pairs(True, params)
    fill_tree_attributes(mojo, tree_attrs, True)

    # The class id of each leaf is its tree id modulo the number of trees per group.
    group_size = params["n_trees_in_group"]
    tree_attrs['class_ids'] = [tree_id % group_size for tree_id in tree_attrs['class_treeids']]
    tree_attrs['classlabels_strings'] = params["class_labels"]

    container.add_node(
        'TreeEnsembleClassifier',
        operator.input_full_names,
        operator.output_full_names,
        op_domain='ai.onnx.ml',
        name=scope.get_unique_operator_name('TreeEnsembleClassifier'),
        **tree_attrs
    )
155+
156+
157+
def convert_h2o(scope, operator, container):
    '''
    Convert an 'H2OTreeMojo' operator by dispatching on the model's "classifier" flag.

    :param scope: Scope passed through to the selected converter
    :param operator: Operator whose ``raw_operator["params"]`` holds the model parameters
    :param container: Container passed through to the selected converter
    '''
    params = operator.raw_operator["params"]
    convert_fn = convert_classifier if params["classifier"] else convert_regression
    convert_fn(scope, operator, container, params)


register_converter('H2OTreeMojo', convert_h2o)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
7+
# To register shape calculators for H2O operators, import associated modules here.
8+
from . import h2otreemojo

0 commit comments

Comments
 (0)