|
| 1 | +""" |
| 2 | +.. _l-plot-failing-model-extract: |
| 3 | +
|
| 4 | +Find where a model is failing by running submodels |
| 5 | +================================================== |
| 6 | +
|
| 7 | +Let's assume :epkg:`onnxruntime` crashes without telling why or where. |
| 8 | +The first thing to do is to locate where. For that, we extract every submodel |
| 9 | +starting from the inputs and running the first *n* nodes of the model. |
| 10 | +The model is likely to fail for some *n*. Then the failing node is known. |
| 11 | +
|
| 12 | +A failing model |
| 13 | ++++++++++++++++ |
| 14 | +
|
| 15 | +The issue here is an operator ``Cast`` trying to convert a result |
| 16 | +into a non-existing type. |
| 17 | +""" |
| 18 | + |
| 19 | +import numpy as np |
| 20 | +import onnx |
| 21 | +import onnx.helper as oh |
| 22 | +import onnxruntime |
| 23 | +from onnx_diagnostic.helpers import from_array_extended |
| 24 | +from onnx_diagnostic.ort_session import investigate_onnxruntime_issue |
| 25 | + |
| 26 | +TFLOAT = onnx.TensorProto.FLOAT |
| 27 | + |
| 28 | +model = oh.make_model( |
| 29 | + oh.make_graph( |
| 30 | + [ |
| 31 | + oh.make_node("Mul", ["X", "Y"], ["xy"], name="n0"), |
| 32 | + oh.make_node("Sigmoid", ["xy"], ["sy"], name="n1"), |
| 33 | + oh.make_node("Add", ["sy", "one"], ["C"], name="n2"), |
| 34 | + oh.make_node("Cast", ["C"], ["X999"], to=999, name="failing"), |
| 35 | + oh.make_node("CastLike", ["X999", "Y"], ["Z"], name="n4"), |
| 36 | + ], |
| 37 | + "nd", |
| 38 | + [ |
| 39 | + oh.make_tensor_value_info("X", TFLOAT, ["a", "b", "c"]), |
| 40 | + oh.make_tensor_value_info("Y", TFLOAT, ["a", "b", "c"]), |
| 41 | + ], |
| 42 | + [oh.make_tensor_value_info("Z", TFLOAT, ["a", "b", "c"])], |
| 43 | + [from_array_extended(np.array([1], dtype=np.float32), name="one")], |
| 44 | + ), |
| 45 | + opset_imports=[oh.make_opsetid("", 18)], |
| 46 | + ir_version=9, |
| 47 | +) |
| 48 | + |
| 49 | +# %% |
| 50 | +# We check it is failing. |
| 51 | + |
| 52 | +try: |
| 53 | + onnxruntime.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"]) |
| 54 | +except onnxruntime.capi.onnxruntime_pybind11_state.Fail as e: |
| 55 | + print(e) |
| 56 | + |
| 57 | + |
| 58 | +# %% |
| 59 | +# Shape Inference |
| 60 | +# +++++++++++++++ |
| 61 | +# |
| 62 | +# Building submodels requires knowing the output type. |
| 63 | +# We run shape inference on the model. |
| 64 | +shaped_model = onnx.shape_inference.infer_shapes(model) |
| 65 | + |
| 66 | + |
| 67 | +# %% |
| 68 | +# Looping over the nodes |
| 69 | +# ++++++++++++++++++++++ |
| 70 | +# |
| 71 | +# |
| 72 | + |
| 73 | +failing = investigate_onnxruntime_issue(shaped_model, providers="cpu", verbose=1, quiet=True) |
| 74 | + |
| 75 | +# %% |
| 76 | +# Let's print the failing node. |
| 77 | +print(failing) |
| 78 | + |
| 79 | + |
| 80 | +# %% |
| 81 | +# Detect an issue with shape Inference |
| 82 | +# ++++++++++++++++++++++++++++++++++++ |
| 83 | +# |
| 84 | +# We could have caught the error sooner by asking shape inference |
| 85 | +# to raise an exception if one node could not be processed. |
| 86 | +# It means either the node is a custom node |
| 87 | +# and shape inference has no way to guess the output type and shape |
| 88 | +# for this node or shape inference failed. |
| 89 | + |
| 90 | +try: |
| 91 | + onnx.shape_inference.infer_shapes(model, strict_mode=True) |
| 92 | +except onnx.onnx_cpp2py_export.shape_inference.InferenceError as e: |
| 93 | + print(e) |
0 commit comments