11import os
22import unittest
33import torch
4- from onnx_diagnostic .ext_test_case import ExtTestCase , hide_stdout
4+ from onnx_diagnostic .ext_test_case import (
5+ ExtTestCase ,
6+ hide_stdout ,
7+ requires_transformers ,
8+ requires_torch ,
9+ )
510from onnx_diagnostic .helpers import max_diff , flatten_object
6- from onnx_diagnostic .helpers .rt_helper import onnx_generate
11+ from onnx_diagnostic .helpers .rt_helper import onnx_generate , make_empty_cache
712from onnx_diagnostic .helpers .torch_helper import torch_deepcopy
813from onnx_diagnostic .helpers .ort_session import InferenceSessionForTorch
914from onnx_diagnostic .torch_models .hghub import get_untrained_model_with_inputs
@@ -21,16 +26,33 @@ def simple_generate_with_cache(
2126 max_new_tokens : int = 100 ,
2227 ):
2328 # First call: prefill
24- outputs = model (
25- input_ids ,
26- use_cache = True ,
27- attention_mask = torch .ones (
28- input_ids .shape , dtype = input_ids .dtype , device = input_ids .device
29+ attention_mask = torch .ones (
30+ input_ids .shape , dtype = input_ids .dtype , device = input_ids .device
31+ )
32+ feeds = {
33+ ** dict (zip (session .input_names [:2 ], [input_ids , attention_mask ])),
34+ ** make_empty_cache (
35+ input_ids .shape [0 ],
36+ session .input_names [2 :],
37+ session .input_shapes [2 :],
38+ session .input_types [2 :],
2939 ),
40+ }
41+ onnx_results = session .run (None , feeds )
42+
43+ outputs = model (input_ids , use_cache = True , attention_mask = attention_mask )
44+
45+ diff = max_diff (outputs , onnx_results )
46+ assert diff ["abs" ] <= 0.1 , (
47+ f"Unexpected issue with { type (model )} \n diff={ diff } "
48+ f"\n input_ids.shape={ input_ids .shape } "
49+ f"\n expected={ self .string_type (outputs , with_shape = True , with_min_max = True )} "
50+ f"\n got=\n "
51+ f"{ self .string_type (onnx_results , with_shape = True , with_min_max = True )} "
3052 )
3153
3254 # Next calls: decode
33- for _ in range (max_new_tokens ):
55+ for iteration in range (max_new_tokens ):
3456 next_token_logits = outputs .logits [:, - 1 , :]
3557 next_token_id = torch .argmax (next_token_logits , dim = - 1 , keepdim = True )
3658 if next_token_id .item () == eos_token_id :
@@ -42,11 +64,14 @@ def simple_generate_with_cache(
4264 feeds = dict (
4365 zip (
4466 session .input_names ,
45- torch_deepcopy (
46- flatten_object (
47- [next_token_id , attention_mask , outputs .past_key_values ]
67+ [
68+ t .detach ()
69+ for t in torch_deepcopy (
70+ flatten_object (
71+ [next_token_id , attention_mask , outputs .past_key_values ]
72+ )
4873 )
49- ) ,
74+ ] ,
5075 )
5176 )
5277 onnx_results = session .run (None , feeds )
@@ -57,9 +82,17 @@ def simple_generate_with_cache(
5782 attention_mask = attention_mask ,
5883 )
5984 diff = max_diff (outputs , onnx_results )
60- print ("****" , diff )
85+ assert diff ["abs" ] <= 0.1 , (
86+ f"Unexpected issue with { type (model )} , iteration={ iteration } "
87+ f"\n diff={ diff } \n input_ids.shape={ input_ids .shape } "
88+ f"\n expected={ self .string_type (outputs , with_shape = True , with_min_max = True )} "
89+ f"\n got=\n "
90+ f"{ self .string_type (onnx_results , with_shape = True , with_min_max = True )} "
91+ )
6192 return input_ids
6293
94+ @requires_transformers ("4.55" )
95+ @requires_torch ("2.9" )
6396 @hide_stdout ()
6497 def test_onnx_generate (self ):
6598 mid = "arnir0/Tiny-LLM"
@@ -83,25 +116,25 @@ def test_onnx_generate(self):
83116 exporter = "custom" ,
84117 )
85118
86- print ("-- test_onnx_generate: generate" )
87- res , session = onnx_generate (
88- model_name , input_ids [:1 ], 2 , max_new_tokens = 10 , return_session = True
89- )
90- n_inputs = input_ids .shape [1 ]
91- self .assertEqualArray (input_ids [:1 ], res [:, :n_inputs ])
92- self .assertEqual (res .dtype , torch .int64 )
93- self .assertEqual (res .shape , (1 , 13 ))
94- print ("-- test_onnx_generate: done" )
95- # expected = model.generate(input_ids[:1], max_new_tokens=10)
96- expected = self .simple_generate_with_cache (
97- model , input_ids [:1 ], 2 , max_new_tokens = 10 , session = session
98- )
99- self .assertEqualArray (input_ids [:1 ], expected [:, :n_inputs ])
100- print ("******" , res )
101- print ("******" , expected )
102- self .assertEqual (expected .dtype , torch .int64 )
103- self .assertEqual (expected .shape , (1 , 13 ))
104- self .assertEqualArray (expected , res )
119+ print ("-- test_onnx_generate: generate" )
120+ res , session = onnx_generate (
121+ model_name , input_ids [:1 ], 2 , max_new_tokens = 10 , return_session = True
122+ )
123+ n_inputs = input_ids .shape [1 ]
124+ self .assertEqualArray (input_ids [:1 ], res [:, :n_inputs ])
125+ self .assertEqual (res .dtype , torch .int64 )
126+ self .assertEqual (res .shape , (1 , 13 ))
127+ print ("-- test_onnx_generate: done" )
128+ # expected = model.generate(input_ids[:1], max_new_tokens=10)
129+ expected = self .simple_generate_with_cache (
130+ model , input_ids [:1 ], 2 , max_new_tokens = 10 , session = session
131+ )
132+ self .assertEqualArray (input_ids [:1 ], expected [:, :n_inputs ])
133+ print ("******" , res )
134+ print ("******" , expected )
135+ self .assertEqual (expected .dtype , torch .int64 )
136+ self .assertEqual (expected .shape , (1 , 13 ))
137+ self .assertEqualArray (expected , res )
105138
106139
107140if __name__ == "__main__" :