
Commit d65493b

support prompt processing and token generation
1 parent: 8bd2fa1

File tree: 2 files changed (+17, -18 lines)


onnx_diagnostic/tasks/text_generation.py

Lines changed: 9 additions & 10 deletions
@@ -230,8 +230,11 @@ def get_inputs(
             0: batch,
             1: seq_length,
         },
+        "past_key_values": [
+            [{0: batch, 2: past_seq_length} for _ in range(num_hidden_layers)],
+            [{0: batch, 2: past_seq_length} for _ in range(num_hidden_layers)],
+        ],
     }
-
     inputs = dict(
         input_ids=torch.randint(
             0, dummy_max_token_id, (batch_size, sequence_length)
@@ -244,10 +247,7 @@ def get_inputs(
         )
         .to(torch.int64)
         .expand((batch_size, -1)),
-    )
-    # Caches are involved
-    if past_sequence_length > 0:
-        inputs["past_key_values"] = make_cache(
+        past_key_values=make_cache(
             [
                 (
                     torch.randn(
@@ -259,11 +259,10 @@ def get_inputs(
                 )
                 for i in range(num_hidden_layers)
             ]
-        )
-        shapes["past_key_values"] = [
-            [{0: batch, 2: past_seq_length} for _ in range(num_hidden_layers)],
-            [{0: batch, 2: past_seq_length} for _ in range(num_hidden_layers)],
-        ]
+        ),
+    )
+    # NOTE: past_sequence_length can be 0 when testing prompt processing,
+    # in which case the cache tensors are empty
     res = dict(inputs=inputs, dynamic_shapes=shapes)
     if add_second_input:
         # prompt processing (prefill) testing
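The effect of this change is that `past_key_values` is now always part of the generated inputs: during prompt processing (prefill) the cache is simply built with a past length of 0, so its tensors are empty along the sequence dimension, while token generation (decode) uses a non-empty cache. Below is a minimal sketch of that idea using plain tensors in the legacy (key, value)-per-layer layout; the sizes and the `make_legacy_cache` helper are illustrative stand-ins, not the repository's `make_cache`:

```python
import torch

# Illustrative sizes (assumptions, not taken from the repository).
batch_size, num_heads, head_dim, num_hidden_layers = 2, 4, 8, 3

def make_legacy_cache(past_sequence_length):
    # One (key, value) pair per layer, shaped [batch, heads, past_seq, head_dim].
    # With past_sequence_length == 0 the tensors are empty along dim 2.
    return [
        (
            torch.randn(batch_size, num_heads, past_sequence_length, head_dim),
            torch.randn(batch_size, num_heads, past_sequence_length, head_dim),
        )
        for _ in range(num_hidden_layers)
    ]

# Prompt processing (prefill): the whole prompt, an empty cache.
prefill_inputs = dict(
    input_ids=torch.randint(0, 1000, (batch_size, 12)),
    past_key_values=make_legacy_cache(0),
)

# Token generation (decode): a single new token, a non-empty cache.
decode_inputs = dict(
    input_ids=torch.randint(0, 1000, (batch_size, 1)),
    past_key_values=make_legacy_cache(12),
)

print(prefill_inputs["past_key_values"][0][0].shape)  # torch.Size([2, 4, 0, 8])
print(decode_inputs["past_key_values"][0][0].shape)   # torch.Size([2, 4, 12, 8])
```

Because both phases now share the same input signature, the dynamic-shapes declaration above can keep `{0: batch, 2: past_seq_length}` on the cache entries in both cases.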

onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 8 additions & 8 deletions
@@ -1657,7 +1657,7 @@ def patched_sdpa_attention_forward(
     is_causal: Optional[bool] = None,
     **kwargs,
 ) -> tuple[torch.Tensor, None]:
-    """manual patch for function ```transformers.integrations.sdpa_attention.sdpa_attention_forward```."""
+    """manual patch for function ```transformers.integrations.sdpa_attention.sdpa_attention_forward```."""  # noqa: E501
     if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
         logger.warning_once(
             "`sdpa` attention does not support `output_attentions=True` or `head_mask`."
@@ -1674,18 +1674,18 @@ def patched_sdpa_attention_forward(
     if attention_mask is not None and attention_mask.ndim == 4:
         attention_mask = attention_mask[:, :, :, : key.shape[-2]]

-    # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
-    # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
-    # Note that it is important to check first for the shape, otherwise compile will fail with `argument 'is_causal' must be bool, not SymBool`
+    # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment  # noqa: E501
+    # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.  # noqa: E501
+    # Note that it is important to check first for the shape, otherwise compile will fail with `argument 'is_causal' must be bool, not SymBool`  # noqa: E501
     if is_causal is None:
-        # The last condition is for encoder (decoder) models which specify this by passing their own `is_causal` flag
-        # This is mainly due to those models having mixed implementations for encoder, decoder, and encoder-decoder attns
-        # is_causal = query.shape[2] > 1 and attention_mask is None and getattr(module, "is_causal", True)
+        # The last condition is for encoder (decoder) models which specify this by passing their own `is_causal` flag  # noqa: E501
+        # This is mainly due to those models having mixed implementations for encoder, decoder, and encoder-decoder attns  # noqa: E501
+        # is_causal = query.shape[2] > 1 and attention_mask is None and getattr(module, "is_causal", True)  # noqa: E501
         # NOTE: query.shape[2] == 1 or > 1 should have the same output for causal attention
         # so we simplify the condition to:
         is_causal = attention_mask is None and getattr(module, "is_causal", True)

-    # Shapes (e.g. query.shape[2]) are tensors during jit tracing, resulting in `is_causal` being a tensor.
+    # Shapes (e.g. query.shape[2]) are tensors during jit tracing, resulting in `is_causal` being a tensor.  # noqa: E501
     # We convert it to a bool for the SDPA kernel that only accepts bools.
     if torch.jit.is_tracing() and isinstance(is_causal, torch.Tensor):
         is_causal = is_causal.item()

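The lines above only append `# noqa: E501` markers, but the surrounding comments describe how the patched function picks `is_causal` before calling SDPA. A condensed sketch of that dispatch, assuming only the public `torch.nn.functional.scaled_dot_product_attention` API; `sdpa_call` and `module_is_causal` are hypothetical names for illustration, not the patched function itself:

```python
import torch
import torch.nn.functional as F

def sdpa_call(query, key, value, attention_mask=None, is_causal=None, module_is_causal=True):
    # Trim a 4D mask to the key length, as in the patched function.
    if attention_mask is not None and attention_mask.ndim == 4:
        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
    if is_causal is None:
        # Simplified condition: causal only when no explicit mask is provided.
        is_causal = attention_mask is None and module_is_causal
    if torch.jit.is_tracing() and isinstance(is_causal, torch.Tensor):
        # SDPA only accepts a plain bool, not a traced tensor.
        is_causal = is_causal.item()
    return F.scaled_dot_product_attention(
        query, key, value, attn_mask=attention_mask, is_causal=is_causal
    )

q = k = v = torch.randn(1, 4, 10, 8)
print(sdpa_call(q, k, v).shape)  # torch.Size([1, 4, 10, 8]) -- causal path, no mask
```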