from onnx_diagnostic.helpers.rt_helper import onnx_generate
from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
from onnx_diagnostic.torch_export_patches import torch_export_patches
+from onnx_diagnostic.export.api import to_onnx


class TestRtSession(ExtTestCase):
+    def simple_generate_with_cache(
+        self, model, input_ids: torch.Tensor, eos_token_id: int, max_new_tokens: int = 100
+    ):
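+        # Reference greedy decoding with a KV cache, used below to validate onnx_generate.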
+        # First call: prefill
+        outputs = model(
+            input_ids,
+            attention_mask=torch.ones(
+                input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+            ),
+            use_cache=True,
+        )
+
+        # Next calls: decode
+        for _ in range(max_new_tokens):
+            next_token_logits = outputs.logits[:, -1, :]
+            past_key_values = outputs.past_key_values
+            next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+            if next_token_id.item() == eos_token_id:
+                break
+            input_ids = torch.cat([input_ids, next_token_id], dim=-1)
+            outputs = model(
+                next_token_id,
+                use_cache=True,
+                past_key_values=past_key_values,
+                attention_mask=torch.ones(
+                    input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+                ),
+            )
+        return input_ids
+
    @hide_stdout()
    def test_onnx_generate(self):
-        from experimental_experiment.torch_interpreter import to_onnx
-
        mid = "arnir0/Tiny-LLM"
        print("-- test_onnx_generate: get model")
        data = get_untrained_model_with_inputs(mid)
        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
        del inputs["position_ids"]
        del ds["position_ids"]
        input_ids = inputs["input_ids"]
+        print("----", input_ids.shape)
        folder = self.get_dump_folder("test_onnx_generate")
        model_name = os.path.join(folder, "model.onnx")
        print("-- test_onnx_generate: export model")
@@ -29,13 +59,24 @@ def test_onnx_generate(self):
            kwargs=inputs,
            dynamic_shapes=ds,
            filename=model_name,
+            exporter="custom",
        )

        print("-- test_onnx_generate: generate")
        res = onnx_generate(model_name, input_ids[:1], 2, max_new_tokens=10)
+        n_inputs = input_ids.shape[1]
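+        # The generated sequence must start with the original prompt tokens.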
+        self.assertEqualArray(input_ids[:1], res[:, :n_inputs])
        self.assertEqual(res.dtype, torch.int64)
        self.assertEqual(res.shape, (1, 13))
        print("-- test_onnx_generate: done")
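+        # Eager reference: greedy decoding with a KV cache must match the ONNX generation.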
+        # expected = model.generate(input_ids[:1], max_new_tokens=10)
+        expected = self.simple_generate_with_cache(model, input_ids[:1], 2, max_new_tokens=10)
+        self.assertEqualArray(input_ids[:1], expected[:, :n_inputs])
+        print("******", res)
+        print("******", expected)
+        self.assertEqual(expected.dtype, torch.int64)
+        self.assertEqual(expected.shape, (1, 13))
+        self.assertEqualArray(expected, res)


if __name__ == "__main__":