feat: extract onnx info

jlwalke2 · jlwalke2 · commit 91f9b15fe5c3 · 2024-08-28T14:08:24.000-04:00
diff --git a/src/sasctl/utils/model_info.py b/src/sasctl/utils/model_info.py
@@ -4,6 +4,7 @@
 # Copyright © 2023, SAS Institute Inc., Cary, NC, USA.  All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, List, Union
 
@@ -17,6 +18,9 @@
 
 try:
     import onnx
+
+    # ONNX serializes models using protobuf, so this should be safe
+    from google.protobuf import json_format
 except ImportError:
     onnx = None
 
@@ -198,9 +202,6 @@ def __init__(self, model, X, y=None):
                 "The onnx package must be installed to work with ONNX models.  Please `pip install onnx`."
             )
 
-        # ONNX serializes models using protobuf, so this should be safe
-        from google.protobuf import json_format
-
         # TODO: size of X should match size of graph.input
 
         self._model = model
@@ -209,22 +210,18 @@ def __init__(self, model, X, y=None):
 
         inferred_model = onnx.shape_inference.infer_shapes(model)
 
-        inputs = [json_format.MessageToDict(i) for i in inferred_model.graph.input]
-        outputs = [json_format.MessageToDict(o) for o in inferred_model.graph.output]
+        inputs = [self._tensor_to_dataframe(i) for i in inferred_model.graph.input]
+        outputs = [self._tensor_to_dataframe(o) for o in inferred_model.graph.output]
 
         if len(inputs) > 1:
             pass # TODO: warn that only first input will be captured
 
         if len(outputs) > 1:
             pass # TODO: warn that only the first output will be captured
 
-        inputs[0]["type"]["tensorType"]["elemType"]
-        inputs[0]["type"]["tensorType"]["shape"]
+        self._X_df = inputs[0]
+        self._y_df = outputs[0]
 
-        self._properties = {
-            "description": model.doc_string,
-            "opset": model.opset_import
-        }
         # initializer (static params)
 
         # for field in model.ListFields():
@@ -243,45 +240,76 @@ def __init__(self, model, X, y=None):
         # producerVersion
         # opsetImport
 
-
         # # list of (FieldDescriptor, value)
         # fields = model.ListFields()
-        # inferred_model = onnx.shape_inference.infer_shapes(model)
-        #
-        # inputs = model.graph.input
-        # assert len(inputs) == 1
-        # i = inputs[0]
-        # print(i.name)
-        # print(i.type)
-        # print(i.type.tensor_type.shape)
+
+    @staticmethod
+    def _tensor_to_dataframe(tensor):
+        """
+
+        Parameters
+        ----------
+        tensor : onnx.onnx_ml_pb2.ValueInfoProto or dict
+            A protobuf `Message` containing information
+
+        Returns
+        -------
+        pandas.DataFrame
+
+        Examples
+        --------
+        df = _tensor_to_dataframe(model.graph.input[0])
+
+        """
+        if isinstance(tensor, onnx.onnx_ml_pb2.ValueInfoProto):
+            tensor = json_format.MessageToDict(tensor)
+        elif not isinstance(tensor,  dict):
+            raise ValueError(f"Unexpected type {type(tensor)}.")
+
+        name = tensor.get("name", "Var")
+        type_ = tensor["type"]
+
+        if not "tensorType" in type_:
+            raise ValueError(f"Received an unexpected ONNX input type: {type_}.")
+
+        dtype = onnx.helper.tensor_dtype_to_np_dtype(type_["tensorType"]["elemType"])
+
+        # Tuple of tensor dimensions e.g. (1, 1, 24)
+        input_dims = tuple(int(d["dimValue"]) for d in type_["tensorType"]["shape"]["dim"])
+
+        return pd.DataFrame(dtype=dtype, columns=[f"{name}{i+1}" for i in range(math.prod(input_dims))])
 
     @property
     def algorithm(self) -> str:
         return "neural network"
 
+    @property
+    def description(self) -> str:
+        return self.model.doc_string
+
     @property
     def is_binary_classifier(self) -> bool:
-        return False
+        return len(self.output_column_names) == 2
 
     @property
     def is_classifier(self) -> bool:
-        return False
+        return len(self.output_column_names) > 1
 
     @property
     def is_clusterer(self) -> bool:
         return False
 
     @property
     def is_regressor(self) -> bool:
-        return False
+        return len(self.output_column_names) == 1
 
     @property
     def model(self) -> object:
         return self._model
 
     @property
     def model_params(self) -> Dict[str, Any]:
-        return {}
+        return {k: getattr(self.model, k, None) for k in ("ir_version", "model_version", "opset_import", "producer_name", "producer_version")}
 
     @property
     def predict_function(self) -> Callable:
@@ -300,12 +328,12 @@ def threshold(self) -> Union[str, None]:
         return None
 
     @property
-    def X(self):
-        return self._X
+    def X(self) -> pd.DataFrame:
+        return self._X_df
 
     @property
-    def y(self):
-        return self._y
+    def y(self) -> pd.DataFrame:
+        return self._y_df
 
 
 class PyTorchModelInfo(ModelInfo):
diff --git a/tests/unit/test_model_info_onnx.py b/tests/unit/test_model_info_onnx.py
@@ -1,10 +1,10 @@
+import pandas as pd
 import pytest
 
-import sasctl.utils.model_info
-
 onnx = pytest.importorskip("onnx")
 torch = pytest.importorskip("torch")
 
+import sasctl.utils.model_info
 from sasctl.utils import get_model_info
 
 # mnist
@@ -41,4 +41,15 @@ def forward(self, x):
 def test_get_info(mnist_model):
     info = get_model_info(*mnist_model)
     assert isinstance(info, sasctl.utils.model_info.OnnxModelInfo)
-    print(mnist_model)
+
+    # Output be classification into 10 digits
+    assert len(info.output_column_names) == 10
+    assert all(c.startswith("digit") for c in info.output_column_names)
+
+    assert isinstance(info.X, pd.DataFrame)
+    assert len(info.X.columns) == 28 * 28
+
+    assert info.is_classifier
+    assert not info.is_binary_classifier
+    assert not info.is_regressor
+    assert not info.is_clusterer