sdpython · sdpython · Nov 25, 2025 · Nov 25, 2025 · Nov 25, 2025 · Nov 25, 2025
diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
 0.8.3
 +++++
 
+* :pr:`322`: support rerunning onnx kernels with torch intermediate results in side-by-side
 * :pr:`314`: fix modelbuilder download needed after this change https://github.com/microsoft/onnxruntime-genai/pull/1862
 * :pr:`311`: use custom and local function to use PackedMultiHeadAttention from onnxruntime
 * :pr:`310`: splits patches into multiple files 

diff --git a/_doc/cmds/sbs.rst b/_doc/cmds/sbs.rst
@@ -20,3 +20,40 @@ CPU, CUDA
 
 Inputs are saved :func:`torch.save`. The execution will run on CUDA
 if the device of the inputs is CUDA, same goes on CPU.
+
+Example
++++++++
+
+.. code-block::
+
+    python -m onnx_diagnostic sbs \
+        -i qwen_2_5_vl_instruct_visual.inputs.pt \
+        --ep test_imagetext2text_qwen_2_5_vl_instruct_visual.cuda.float16.custom.graph.ep.pt2 \
+        -m test_imagetext2text_qwen_2_5_vl_instruct_visual.cuda.float16.custom.onnx \
+        -o results.dynamo.float16.xlsx \
+        -v 1 --atol=0.1 --rtol=1 \
+        --replay-names conv3d,rsqrt,to_4,mul_48,linear,linear_2,linear_84,linear_89,mul_172,linear_156,linear_159 \
+        -2 --reset conv3d
+
+A snippet of the table it produces:
+
+::
+
+    ep_name         onnx_name       ep_target               onnx_op_type            onnx_id_output   ep_shape_type      onnx_shape_type    err_abs 
+    transpose_18    transpose_18    aten.transpose.int      Transpose                           0    GT10s16x1292x80    GT10s16x1292x80    0.0083 
+    unsqueeze_50    unsqueeze_50    aten.unsqueeze.default  Unsqueeze                           0    GT10s1x16x1292x80  GT10s1x16x1292x80  0.0083 
+    eq_20           eq_20           aten.eq.Scalar          Equal                               0    GT9s1292x1292      GT9s1292x1292      0   
+    unsqueeze_56    unsqueeze_56    aten.unsqueeze.default  Unsqueeze                           0    GT9s1x1x1292x1292  GT9s1x1x1292x1292  0   
+    slice_29        slice_29        aten.slice.Tensor       Slice                               0    GT9s1x1x1292x1292  GT9s1x1x1292x1292  0   
+    transpose_19    transpose_19    aten.transpose.int      Transpose                           0    GT10s1x1292x16x80  GT10s1x1292x16x80  0.0071 
+    reshape_20      reshape_20      aten.reshape.default    Reshape                             0    GT10s1292x1280     GT10s1292x1280     0.0071 
+    linear_21       linear_21       aten.linear.default     Gemm                                0    GT10s1292x1280     GT10s1292x1280     0.0015 
+    mul_54          mul_54          aten.mul.Tensor         SkipSimplifiedLayerNormalization    0    GT10s1292x1280     GT10s1292x1280     0.0098 
+    add_32          add_32          aten.add.Tensor         SkipSimplifiedLayerNormalization    3    GT10s1292x1280     GT10s1292x1280     0.0313 
+    linear_22       linear_22       aten.linear.default     Gemm                                0    GT10s1292x3420     GT10s1292x3420     0.0078 
+    silu_4          silu_4          aten.silu.default       QuickGelu                           0    GT10s1292x3420     GT10s1292x3420     0.0059 
+
+The available columns are described by
+:class:`RunAlignedRecord <onnx_diagnostic.torch_onnx.sbs_dataclasses.RunAlignedRecord>`.
+It is possible to dump pieces of the model to study some particular input
+with :class:`ReplayConfiguration <onnx_diagnostic.torch_onnx.sbs_dataclasses.ReplayConfiguration>`.
diff --git a/_unittests/ut_torch_onnx/test_sbs.py b/_unittests/ut_torch_onnx/test_sbs.py
@@ -379,7 +379,7 @@ def forward(self, x):
                 use_tensor=True,
             ),
         )
-        df = pandas.DataFrame(list(results))
+        df = pandas.DataFrame(list(results)).dropna(axis=1, how="all")
         df.to_excel(self.get_dump_file("test_sbs_model_with_weights_custom.xlsx"))
         self.assertEqual(
             [
@@ -390,8 +390,8 @@ def forward(self, x):
                 "ep_time_run",
                 "err_abs",
                 "err_dev",
+                "err_h001",
                 "err_h01",
-                "err_nan",
                 "err_rel",
                 "onnx_id_node",
                 "onnx_id_output",
@@ -445,7 +445,7 @@ def forward(self, x):
                 use_tensor=True,
             ),
         )
-        df = pandas.DataFrame(list(results))
+        df = pandas.DataFrame(list(results)).dropna(axis=1, how="all")
         df.to_excel(self.get_dump_file("test_sbs_model_with_weights_dynamo.xlsx"))
         self.assertEqual(
             [
@@ -456,8 +456,8 @@ def forward(self, x):
                 "ep_time_run",
                 "err_abs",
                 "err_dev",
+                "err_h001",
                 "err_h01",
-                "err_nan",
                 "err_rel",
                 "onnx_id_node",
                 "onnx_id_output",
@@ -542,7 +542,7 @@ def forward(self, x):
                 reset_names=["linear"],
             ),
         )
-        df = pandas.DataFrame(list(results))
+        df = pandas.DataFrame(list(results)).dropna(axis=1, how="all")
         df.to_excel(self.get_dump_file("test_sbs_model_with_weights_custom_reset.xlsx"))
         onnx_op_type = df["onnx_op_type"].tolist()
         self.assertEqual(onnx_op_type.count("reset"), 1)
@@ -593,10 +593,83 @@ def forward(self, x):
                 ),
             ),
         )
-        df = pandas.DataFrame(list(results))
+        df = pandas.DataFrame(list(results)).dropna(axis=1, how="all")
         df.to_excel(self.get_dump_file("test_sbs_replay.xlsx"))
-        print(df)
-        # self.clean_dump()
+        self.assertEqual(df.shape, (8, 16))
+        self.clean_dump()
+
+    @hide_stdout()
+    @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning))
+    def test_sbs_run_onnx_with_torch_inputs(self):
+        torch = self.torch
+
+        class Model(self.torch.nn.Module):
+            def __init__(self):
+                super(Model, self).__init__()
+                self.fc1 = torch.nn.Linear(10, 32)  # input size 10 → hidden size 32
+                self.relu = torch.nn.ReLU()
+                self.fc2 = torch.nn.Linear(32, 1)  # hidden → output
+
+            def forward(self, x):
+                x = self.relu(self.fc1(x))
+                x = self.fc2(x)
+                return x
+
+        inputs = dict(x=self.torch.randn((5, 10)))
+        ds = dict(x={0: "batch"})
+        Model()(**inputs)
+        ep = self.torch.export.export(
+            Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds)
+        )
+        filename = self.get_dump_file("test_sbs_run_onnx_with_torch_inputs.onnx")
+        to_onnx(ep, exporter="custom", filename=filename)
+        onx = onnx.load(filename)
+        results = list(
+            run_aligned(
+                ep,
+                onx,
+                kwargs=inputs,
+                run_cls=OnnxruntimeEvaluator,
+                verbose=11,
+                use_tensor=True,
+                run_onnx_with_torch_inputs=True,
+            ),
+        )
+        df = pandas.DataFrame(list(results)).dropna(axis=1, how="all")
+        df.to_excel(self.get_dump_file("test_sbs_run_onnx_with_torch_inputs.xlsx"))
+        self.assertEqual(
+            [
+                "comment",
+                "ep_id_node",
+                "ep_name",
+                "ep_shape_type",
+                "ep_target",
+                "ep_time_run",
+                "err_abs",
+                "err_abs2",
+                "err_dev",
+                "err_dev2",
+                "err_h001",
+                "err_h0012",
+                "err_h01",
+                "err_h012",
+                "err_rel",
+                "err_rel2",
+                "onnx_id_node",
+                "onnx_id_output",
+                "onnx_name",
+                "onnx_op_type",
+                "onnx_shape_type",
+                "onnx_time_run",
+            ],
+            sorted(df.columns),
+        )
+        self.assertEqual(len(results), 8)
+        self.assertEqual([0, 0, 0, 0, None, 0, 0, 0], [r.err_dev for r in results])
+        self.assertEqual(
+            [-1, -1, -1, -1, -1, 0, 1, 2], df["onnx_id_node"].fillna(-10).tolist()
+        )
+        self.clean_dump()
 
 
 if __name__ == "__main__":

diff --git a/onnx_diagnostic/_command_lines_parser.py b/onnx_diagnostic/_command_lines_parser.py
@@ -1217,6 +1217,19 @@ def get_parser_sbs() -> ArgumentParser:
         default=False,
         help="First runs the whole model.",
     )
+    parser.add_argument(
+        "-2",
+        "--second-run",
+        action=BooleanOptionalAction,
+        default=False,
+        help=textwrap.dedent(
+            """
+            Tries to run all onnx nodes with torch results produced by the exported
+            program. It then measures the discrepancies again. It can be used
+            to identify kernel introduces discrepancies from other just propagating them.
+            """
+        ),
+    )
     parser.add_argument(
         "--reset",
         required=False,
@@ -1365,6 +1378,7 @@ def _size(name):
         reset_names=args.reset.split(","),
         exc=False,
         replay_configuration=replay_configuration,
+        run_onnx_with_torch_inputs=args.second_run,
     ):
         data.append(obs)
         if (
@@ -1377,8 +1391,10 @@ def _size(name):
             )
             df.to_excel(args.output)
     print(f"-- final saves into {args.output!r}")
-    df = pandas.DataFrame(data).apply(
-        lambda col: col.fillna("") if col.dtype == "object" else col
+    df = (
+        pandas.DataFrame(data)
+        .apply(lambda col: col.fillna("") if col.dtype == "object" else col)
+        .dropna(axis=1, how="all")
     )
     df.to_excel(args.output, index=False)
     print("-- done")