# Spark ML to ONNX Model Conversion

As of this writing, only 4 SparkML Transformers/Evaluators are converted,
and for most of those only basic options are supported.

There is some prep work needed above and beyond calling the API. In short, these steps are:

* providing the API with the types of the Tensors being input to the Session.
* creating proper Tensors from the DataFrame you are going to use for prediction.
* taking the output Tensor(s) and converting them back to a DataFrame if further processing is required.

## Instructions
For examples, please see the unit tests under `test/sparkml`.

1- Create a list of the input types that need to be supplied to the model conversion call.
For simple cases you can use the `buildInitialTypesSimple()` function in `convert/sparkml/utils.py`;
just pass it your test DataFrame.
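A minimal sketch of that simple case, assuming the module above is importable as `onnxmltools.convert.sparkml.utils` and that `test_df` is the DataFrame whose columns are the pipeline's inputs:
```python
# Assumed import path, based on convert/sparkml/utils.py mentioned above
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple

# builds the (name, tensor type) list from the DataFrame's columns
initial_types = buildInitialTypesSimple(test_df)
```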

Otherwise, the conversion code requires a list of tuples of input name and Tensor type, such as:
```python
# tensor types are provided by onnxmltools' data_types module
from onnxmltools.convert.common.data_types import StringTensorType

initial_types = [
    ("label", StringTensorType([1, 1])),
    # (repeat for the required inputs)
]
```
Note that the input names are the same as the column names from your DataFrame, and they must match the
`inputCol(s)` values you provided when you created your Pipeline.

2- Now you can create the ONNX model from your pipeline model like so:
```python
from onnxmltools import convert_sparkml

pipeline_model = pipeline.fit(training_data)
onnx_model = convert_sparkml(pipeline_model, 'My Sparkml Pipeline', initial_types)
```

3- (optional) You could save the ONNX model for future use or further examination by using the `SerializeToString()`
method of the ONNX model:

```python
with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())
```

4- Before running this model (e.g. using `onnxruntime`) you need to create a `dict` from the input data. This dictionary
will have an entry for each input name and its corresponding tensor data. For simple cases you can use the
`buildInputDictSimple()` function and pass your testing DataFrame to it. Otherwise, you need to create something like the following:

```python
input_data = {}
input_data['label'] = test_df.select('label').toPandas().values
# ... (repeat for all desired inputs)
```
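For the simple case mentioned above, a sketch using `buildInputDictSimple()` (assuming, as with the types helper, that it lives in `onnxmltools.convert.sparkml.utils`) could be as short as:
```python
# Assumed import path, matching the types helper above
from onnxmltools.convert.sparkml.utils import buildInputDictSimple

# builds {input name: numpy array} entries from the test DataFrame
input_data = buildInputDictSimple(test_df)
```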


5- (optional) You could save the converted input data for possible debugging or future reuse. See below:
```python
import pickle

with open("input_data", "wb") as f:
    pickle.dump(input_data, f)
```

6- And finally, run the newly converted ONNX model in the runtime:
```python
import onnxruntime

sess = onnxruntime.InferenceSession(onnx_model.SerializeToString())
output = sess.run(None, input_data)
```
This output may need further conversion back to a DataFrame.
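For example, a minimal sketch of that conversion, assuming the model's first output is a single prediction column (the output names and shapes depend on your pipeline):
```python
import pandas as pd

# sess.run returns a list of numpy arrays, one per model output;
# the column name below is only illustrative
predictions_df = pd.DataFrame(output[0], columns=["prediction"])
```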


## Known Issues

1. StringIndexer must not drop any records: StringIndexer in Spark has a `handleInvalid` option.
Do not set it to a value that drops rows (such as `skip`).
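For instance, a configuration sketch that keeps every record (column names are illustrative):
```python
from pyspark.ml.feature import StringIndexer

# 'keep' assigns invalid/unseen labels to an extra index instead of dropping rows
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex",
                        handleInvalid="keep")
```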
2. OneHotEncoderEstimator must not drop the last bit: OneHotEncoderEstimator has an option
which you can use to make sure the last bit is included in the vector: `dropLast=False`.
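A sketch of that setting, assuming Spark 2.x where the class is still named `OneHotEncoderEstimator` (column names are illustrative):
```python
from pyspark.ml.feature import OneHotEncoderEstimator

# dropLast=False keeps the full one-hot vector
encoder = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                 outputCols=["categoryVec"],
                                 dropLast=False)
```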
3. Use FloatTensorType for all numbers (instead of Int64TensorType or other variations).
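For example, a numeric input would be declared as a float tensor (the column name and shape below are illustrative):
```python
from onnxmltools.convert.common.data_types import FloatTensorType

initial_types = [
    ("amount", FloatTensorType([1, 1])),  # declare numeric inputs as float
]
```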