
Commit 9636ea6

fix: resolving SDPA lowering position issue (output mismatch)
Parent: 3a58d2b

4 files changed: +77 additions, −26 deletions

tools/llm/README.md

Lines changed: 4 additions & 1 deletion
@@ -74,11 +74,14 @@ This codebase can be extended to
 
 ## Limitations
 - We do not currently support sliding window attention (used in Gemma3 and Qwen 3 models) yet.
+- **Flash Attention Limitation**: Some models (e.g., Eagle2-2B) internally use flash attention operations (`torch.ops.flash_attn._flash_attn_forward.default`) which require the `flash-attn` package to be installed. Without flash-attn, these models will fail to load or run properly.
 
 ## Requirements
 
 - Torch-TensorRT 2.8.0
 - Transformers v4.52.3
 - For VLM models (run_vlm.py):
   - `pip install qwen-vl-utils` (for Qwen2.5-VL-3B-Instruct model)
-  - `pip install flash-attn --no-build-isolation -v` (for Eagle2-2B model)
+  - **Flash Attention**: For models using flash attention operations (e.g., Eagle2-2B), install one of the following:
+    - **Fast installation (recommended)**: `pip install flash-attn==2.8.1` (pre-built wheel, should work)
+    - **Source build (slow)**: `pip install flash-attn --no-build-isolation -v` (fallback if pre-built wheels fail)
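
The added limitation suggests a simple pre-flight check before loading such a model. A minimal sketch follows; the guard and its error text are illustrative, not part of this commit:

import importlib.util

# Hypothetical guard for models that call torch.ops.flash_attn._flash_attn_forward (e.g. Eagle2-2B).
if importlib.util.find_spec("flash_attn") is None:
    raise RuntimeError(
        "This model requires the 'flash-attn' package: try `pip install flash-attn==2.8.1` "
        "or build from source with `pip install flash-attn --no-build-isolation -v`."
    )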

tools/llm/run_vlm.py

Lines changed: 7 additions & 12 deletions
@@ -37,15 +37,10 @@
 import requests
 import torch
 import torch_tensorrt
-
-# we "monkey-patch" the global attention function map for Qwen2.
-# This ensures that any part of the code (including torch.export) requesting
-# "flash_attention_2" will receive the "sdpa" implementation instead.
-# This patch is global for the script's execution context.
-import transformers.models.qwen2.modeling_qwen2 as mq
 from PIL import Image
-from torchtrt_ext import register_sdpa
-from transformers import AutoConfig, AutoModel, AutoProcessor
+from transformers import AutoModel, AutoProcessor
+from transformers.models.qwen2 import modeling_qwen2 as mq
+from transformers.models.siglip import modeling_siglip as ms
 from utils import (
     export_llm,
     generate_mm,
@@ -59,8 +54,7 @@
 # Eagle2's language model (Qwen2) implicitly defaults to "flash_attention_2"
 # due to settings in its remote code and config.json. This prevents direct
 # compilation with SDPA. To work around this without modifying the library,
-
-
+ms.ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = ms.ALL_ATTENTION_FUNCTIONS["sdpa"]
 mq.ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = mq.ALL_ATTENTION_FUNCTIONS["sdpa"]
 # --- END WORKAROUND ---
 
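
Taken on its own, the workaround is just a re-mapping of the transformers attention-function registry. A minimal standalone sketch of the ordering it relies on; the sanity check is illustrative, not part of the commit:

# Remap the registry *before* the checkpoint is instantiated or exported, so every
# "flash_attention_2" lookup resolves to the SDPA kernel. Mirrors the patch in run_vlm.py
# for the two modules Eagle2-2B uses (Qwen2 language model, SigLIP vision tower).
from transformers.models.qwen2 import modeling_qwen2 as mq
from transformers.models.siglip import modeling_siglip as ms

for module in (mq, ms):
    module.ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = module.ALL_ATTENTION_FUNCTIONS["sdpa"]

# Sanity check: both names now resolve to the same callable.
assert mq.ALL_ATTENTION_FUNCTIONS["flash_attention_2"] is mq.ALL_ATTENTION_FUNCTIONS["sdpa"]
# Any AutoModel.from_pretrained(...) call for the Eagle2 checkpoint should happen only after this point.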

@@ -259,8 +253,6 @@ def _compile_lm(
     seq_len = torch.export.Dim("seq", min=1, max=max_seq_len)
     position_ids = torch.arange(input_embeds.shape[1]).unsqueeze(0).to(device)
 
-    dyn_shapes = {"inputs_embeds": {1: seq_len}, "position_ids": {1: seq_len}}
-
     use_fp32_acc = False
     use_explicit_typing = False
     if args.precision == "FP16":

@@ -594,6 +586,9 @@ def print_outputs(backend_name: str, gen_tokens: torch.Tensor, tokenizer):
 # -------------------------------------------------------------------------#
 # Register static cache lowering passes if requested
 # Cache is not applied to vision model.
+print("--- Registering SDPA lowering pass locally for LM compilation ---")
+from torchtrt_ext import register_sdpa
+
 if args.cache == "static_v1":
     import static_cache_v1  # noqa: F401
 elif args.cache not in ("", None):
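
The fix defers `from torchtrt_ext import register_sdpa` to this point so the SDPA lowering pass is registered only for the language-model compilation, leaving the already-compiled vision model untouched. A toy sketch of that import-side-effect pattern; the registry and pass names below are hypothetical stand-ins, not Torch-TensorRT internals:

# Toy illustration only: LOWERING_PASSES and sdpa_lowering_pass are hypothetical.
LOWERING_PASSES = []

def compile_submodule(name):
    # Stand-in for the real compile call: it applies whatever passes are registered *now*.
    print(f"compiling {name} with passes: {[p.__name__ for p in LOWERING_PASSES]}")

compile_submodule("vision_model")        # SDPA pass not registered yet -> vision outputs unchanged

def sdpa_lowering_pass(gm):              # registered as a side effect of the deferred import
    return gm

LOWERING_PASSES.append(sdpa_lowering_pass)
compile_submodule("language_model")      # the pass now applies, but only to the LM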

tools/llm/static_cache_v1.py

Lines changed: 7 additions & 7 deletions
@@ -39,12 +39,12 @@ def add_kv_as_outputs(gm, kv_cache_for_graph: List[Tuple[torch.Tensor, torch.Ten
     # Get the current output args (typically a tuple)
     current_outputs = output_node.args[0]
 
-    # If the current output is a tuple, extend it with our new outputs
-    if isinstance(current_outputs, tuple):
-        new_outputs = current_outputs + tuple(kv_cache_for_graph)
-    else:
-        # If there's only one output or it's not a tuple, create a new tuple
-        new_outputs = (current_outputs,) + tuple(kv_cache_for_graph)
+    # Ensure the original output is always treated as a tuple to avoid ambiguity
+    if not isinstance(current_outputs, tuple):
+        current_outputs = (current_outputs,)
+
+    # Extend the tuple with our new outputs
+    new_outputs = current_outputs + tuple(kv_cache_for_graph)
 
     gm.graph.output(new_outputs)
     gm.graph.erase_node(output_node)
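
For readers unfamiliar with this kind of graph surgery, here is a self-contained toy version of the same pattern on a traced `torch.fx` module; the toy module and the re-exposed placeholder stand in for the model and its KV-cache tensors:

import torch
from torch import fx

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 1  # single, non-tuple output

gm = fx.symbolic_trace(Toy())

# Locate the existing output node and normalize its args to a tuple,
# mirroring add_kv_as_outputs above.
output_node = next(n for n in gm.graph.nodes if n.op == "output")
current_outputs = output_node.args[0]
if not isinstance(current_outputs, tuple):
    current_outputs = (current_outputs,)

# Stand-ins for the KV-cache tensors: here we just re-expose the input placeholder.
extra_outputs = tuple(n for n in gm.graph.nodes if n.op == "placeholder")
new_outputs = current_outputs + extra_outputs

gm.graph.output(new_outputs)       # emit the new, widened output node
gm.graph.erase_node(output_node)   # drop the old one
gm.recompile()

print(gm(torch.ones(2)))           # (tensor([2., 2.]), tensor([1., 1.]))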
@@ -98,7 +98,7 @@ def get_static_tensor(tensor: torch.Tensor):
     start_idx_input = _add_graph_input(gm, "start_idx", torch.tensor(0))
     end_idx_input = _add_graph_input(gm, "end_idx", torch.tensor(1))
 
-    # Get the max sequence length from the first key_cache node. The order of nodes is: input_ids, is_causal, key_cache1, value_cache1, key_cache2, value_cache2, ..
+    # Get the max sequence length from the first key_cache node. The order of nodes is: input_ids, position_ids, key_cache1, value_cache1, ...
     input_nodes = [node for node in gm.graph.nodes if node.op == "placeholder"]
     input_ids_meta = input_nodes[0].meta["val"]
     seq_len = input_ids_meta.shape[1]
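
The corrected comment documents the placeholder order the pass depends on. A small sketch of inspecting that order and reading `seq_len` from placeholder metadata on an exported toy model; real graphs additionally carry the KV-cache, `start_idx`, and `end_idx` placeholders added later by the pass:

import torch

class TinyLM(torch.nn.Module):
    def forward(self, input_ids, position_ids):
        return input_ids + position_ids

example = (torch.zeros(1, 8, dtype=torch.long), torch.zeros(1, 8, dtype=torch.long))
ep = torch.export.export(TinyLM(), example)

# Placeholders appear in argument order: input_ids first, then position_ids.
placeholders = [n for n in ep.graph.nodes if n.op == "placeholder"]
for node in placeholders:
    print(node.name, tuple(node.meta["val"].shape))

# seq_len is read from the first placeholder's metadata, as in static_cache_v1.py.
seq_len = placeholders[0].meta["val"].shape[1]
print("seq_len =", seq_len)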

tools/llm/utils.py

Lines changed: 59 additions & 6 deletions
@@ -47,7 +47,45 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16):
     return ep
 
 
-def get_zeroed_static_cache_inputs(model: torch.fx.GraphModule, device: str = "cuda:0"):
+def export_llm_no_position_ids(model, inputs, min_seq_len=1, max_seq_len=16):
+    """
+    Exports the LLM model into an ExportedProgram with dynamic shapes.
+    In the case of guard failures due to some PyTorch kernel implements, we also
+    try to re-export the graph by expressing them as runtime assert nodes
+    """
+    with torch.no_grad():
+        # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604
+        seq_len = torch.export.Dim("seq_len", min=min_seq_len, max=max_seq_len)
+        try:
+            print("Trying to export the model using torch.export.export()..")
+            # strict=False only enables aotautograd tracing and excludes dynamo.
+            ep = torch.export.export(
+                model,
+                args=(inputs,),
+                dynamic_shapes=({1: seq_len},),
+                strict=False,
+            )
+        except:
+            print(
+                "Trying torch.export._trace._export to trace the graph since torch.export.export() failed"
+            )
+            # This API is used to express the constraint violation guards as asserts in the graph.
+            ep = torch.export._trace._export(
+                model,
+                args=(inputs,),
+                dynamic_shapes=({1: seq_len},),
+                strict=False,
+                allow_complex_guards_as_runtime_asserts=True,
+            )
+
+    return ep
+
+
+def get_zeroed_static_cache_inputs(
+    model: "torch.fx.GraphModule",
+    device: str = "cuda:0",
+    has_position_ids: bool = True,
+):
     """
     Extracts and returns zeroed static KV cache tensors from a torch.fx.GraphModule. This should only be used for static cache_v1 and static cache_v2.
 
@@ -56,15 +94,26 @@ def get_zeroed_static_cache_inputs(model: torch.fx.GraphModule, device: str = "c
 
     Args:
         model (torch.fx.GraphModule): The exported model graph containing KV cache placeholders
+        device (str): Device to create the zeroed tensors on.
+        has_position_ids (bool): Whether position_ids is present as an input. Default: True
 
     Returns:
         tuple: A tuple of zeroed tensors corresponding to the KV cache placeholders in the graph
     """
     # placeholder nodes are expected to be in the following order:
-    # input_ids, kv_cache_key, kv_cache_value, start_idx, end_idx
+    # input_ids, position_ids, kv_cache_key, kv_cache_value, ..., start_idx, end_idx
     placeholder_nodes = [node for node in model.graph.nodes if node.op == "placeholder"]
-    # The first two inputs are input_ids, position_ids. The last two inputs are start_idx, end_idx. In between are the KV cache tensors.
-    kv_cache_inputs = placeholder_nodes[2:-2]
+
+    # By default, assume input_ids and position_ids are present as the first two inputs.
+    # If has_position_ids is False, only input_ids is present.
+    if has_position_ids:
+        kv_start = 2
+    else:
+        kv_start = 1
+    # The last two inputs are start_idx, end_idx.
+    kv_end = -2
+
+    kv_cache_inputs = placeholder_nodes[kv_start:kv_end]
     zeroed_kv_cache_inputs = []
     for input in kv_cache_inputs:
         zeroed_kv_cache_inputs.append(
@@ -458,7 +507,9 @@ def generate_mm_with_static_cache(
     )
 
     # ───────────────────── KV-cache initialization ─────────────────────
-    kv_cache = get_zeroed_static_cache_inputs(model.language_model, device=device)
+    kv_cache = get_zeroed_static_cache_inputs(
+        model.language_model, device=device, has_position_ids=True
+    )
     start_idx = 0
     end_idx = seq_embeds.size(1)
     generated = 0

@@ -710,7 +761,9 @@ def generate_mm_qwen2_5_vl_with_static_cache(
         with_timing=False,
     )
 
-    kv_cache = get_zeroed_static_cache_inputs(model.model, device=device)
+    kv_cache = get_zeroed_static_cache_inputs(
+        model.model, device=device, has_position_ids=True
+    )
     start_idx = 0
     end_idx = seq_embeds.size(1)
     generated = 0
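
To make the effect of the new `has_position_ids` flag concrete, a self-contained toy check of the slicing it controls; `kv_slice` is a stand-in for the indexing inside `get_zeroed_static_cache_inputs`, and the placeholder names mimic the documented order:

# Toy demonstration: the flag only changes where the KV-cache slice starts in the placeholder list.
def kv_slice(placeholder_names, has_position_ids=True):
    # The last two placeholders are always start_idx and end_idx.
    kv_start = 2 if has_position_ids else 1
    return placeholder_names[kv_start:-2]

with_pos = ["input_ids", "position_ids", "key_cache1", "value_cache1", "start_idx", "end_idx"]
without_pos = ["inputs_embeds", "key_cache1", "value_cache1", "start_idx", "end_idx"]

print(kv_slice(with_pos, has_position_ids=True))     # ['key_cache1', 'value_cache1']
print(kv_slice(without_pos, has_position_ids=False)) # ['key_cache1', 'value_cache1']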
