
Commit 6a1a02e

add register for different sdpa
1 parent e59a529 commit 6a1a02e

2 files changed: +147, -145 lines

tools/llm/run_llm.py

Lines changed: 4 additions & 1 deletion
@@ -58,7 +58,10 @@ def get_model(args):
         .eval()
         .cuda()
     )
-    register_sdpa.register_sdpa_pass_with_model_config(model_config=model.config)
+    if register_sdpa._SDPA_MAPPING.get(args.model, None) is not None:
+        register_sdpa._SDPA_MAPPING[args.model](model_config=model.config)
+    else:
+        register_sdpa._SDPA_MAPPING["default"](model_config=model.config)
 
     if args.precision == "FP16":
         model = model.to(torch.float16)
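
The lookup above is the whole dispatch mechanism: args.model is the Hugging Face model id passed on the command line, and any id without a dedicated entry falls back to the generic pass. A minimal sketch of the same pattern factored into a helper follows; the helper name is illustrative and not part of this commit, and it assumes register_sdpa is already imported as in run_llm.py.

from typing import Any


def register_sdpa_pass_for_model(model_name: str, model_config: Any) -> None:
    # Hypothetical helper: fall back to the generic pass when no
    # model-specific registration exists in _SDPA_MAPPING.
    register_fn = register_sdpa._SDPA_MAPPING.get(
        model_name, register_sdpa._SDPA_MAPPING["default"]
    )
    register_fn(model_config=model_config)


# e.g. register_sdpa_pass_for_model("google/gemma-3-1b-it", model.config)
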
Lines changed: 143 additions & 144 deletions

@@ -1,7 +1,7 @@
 import copy
 import logging
 import operator
-from typing import Callable, Sequence, Tuple
+from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Type
 
 import torch
 from torch_tensorrt.dynamo._settings import CompilationSettings
@@ -13,7 +13,7 @@
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
     clean_up_graph_after_modifications,
 )
-from transformers import Gemma3TextConfig
+from transformers import AutoConfig, Gemma3TextConfig
 
 from .sdpa_converter import *
 
@@ -34,52 +34,130 @@
     torch.ops.aten._scaled_dot_product_flash_attention.default,
 }
 
+from torch_tensorrt.dynamo.lowering.passes._aten_lowering_pass import (
+    get_lowering_pass_config,
+)
+
 
-def register_sdpa_pass_with_model_config(index: int = 0, model_config=None):
-    """
-    Register the SDPA replacement pass with a specific model configuration.
+def _process_sdpa_node(
+    gm: torch.fx.GraphModule,
+    node: torch.fx.Node,
+    settings: CompilationSettings,
+    sliding_window_size: Optional[int] = None,
+    use_gqa: bool = False,
+) -> torch.fx.GraphModule:
+    """Helper function to process SDPA nodes with common logic."""
+
+    if node.target == torch.ops.aten._scaled_dot_product_efficient_attention.default:
+        if len(node.args) == 7:
+            (
+                query,
+                key,
+                value,
+                attn_mask,
+                compute_log_sumexp,
+                dropout_p,
+                is_causal,
+            ) = node.args
+        elif len(node.args) == 5:
+            query, key, value, attn_mask, is_causal = node.args
+            dropout_p = 0.0
+        else:
+            raise ValueError(
+                f"Unexpected number of arguments for {node.target} in the graph"
+            )
+    elif node.target == torch.ops.aten._scaled_dot_product_flash_attention.default:
+        if len(node.args) == 6:
+            (
+                query,
+                key,
+                value,
+                dropout_p,
+                is_causal,
+                return_debug_mask,
+            ) = node.args
+        elif len(node.args) == 5:
+            query, key, value, dropout_p, is_causal = node.args
+        elif len(node.args) == 3:
+            query, key, value = node.args
+            dropout_p = 0.0
+            is_causal = True
+        else:
+            raise ValueError(
+                f"Unexpected number of arguments for {node.target} in the graph"
+            )
+    else:
+        return gm
 
-    Args:
-        model_config: The model configuration object (e.g., from transformers.AutoConfig)
-        index: Position in the lowering pass list (default: 0)
+    # Always set causal to True and generate attn_mask inside the sdpa operator
+    attn_mask = None
+    is_causal = True
+    dropout_p = 0.0
 
-    Example:
-        from transformers import AutoConfig
-        config = AutoConfig.from_pretrained("microsoft/DialoGPT-medium")
-        register_sdpa_pass_with_model_config(config)
-    """
-    from torch_tensorrt.dynamo.lowering.passes._aten_lowering_pass import (
-        _aten_lowering_pass,
-        _remove_lowering_pass,
+    logger.warning(
+        f"SDPA converter configuration: attn_mask={attn_mask}, dropout_p={dropout_p}, "
+        f"is_causal={is_causal}, sliding_window_size={sliding_window_size}, use_gqa={use_gqa}"
     )
 
-    # Create a new pass with the model configuration
-    @_aten_lowering_pass(index=index, model_config=model_config)
-    def replace_variants_of_sdpa_with_config(
-        gm: torch.fx.GraphModule, settings: CompilationSettings
-    ) -> torch.fx.GraphModule:
-        """Replace scaled_dot_product_attention with model-specific configuration"""
+    modified_input_args = (
+        query,
+        key,
+        value,
+        attn_mask,
+        dropout_p,
+        is_causal,
+    )
 
-        # Access the model configuration from the decorator parameters
-        from torch_tensorrt.dynamo.lowering.passes._aten_lowering_pass import (
-            get_lowering_pass_config,
+    # Create a new node with torch.nn.functional.scaled_dot_product_attention
+    with gm.graph.inserting_after(node):
+        new_node = gm.graph.call_function(
+            torch.nn.functional.scaled_dot_product_attention,
+            args=modified_input_args,
+            kwargs={
+                "scale": node.kwargs.get("scale", None),
+                "use_fp32_acc": settings.use_fp32_acc,
+                "sliding_window_size": sliding_window_size,
+            },
         )
 
-        config = get_lowering_pass_config(replace_variants_of_sdpa_with_config)
+    # Deep copy encounters RuntimeError: Cannot access data pointer of Tensor (e.g. FakeTensor, FunctionalTensor). So we use copy instead.
+    new_node.meta = copy.copy(node.meta)
+    # Check if there's a getitem node following this attention node
+    for user in list(node.users):
+        if user.op == "call_function" and user.target == operator.getitem:
+            # If the getitem is extracting the first element (the output tensor)
+            if user.args[1] == 0:
+                # Replace all uses of the getitem with the new attention node
+                user.replace_all_uses_with(new_node)
+                new_node.meta["val"] = new_node.meta["val"][0]
+    # Replace all uses of the original node with the new node
+    node.replace_all_uses_with(new_node)
 
-        model_config = config.get("model_config", None)
-        layer_types = []
+    gm.graph.erase_node(node)
+    return gm
+
+
+def register_gemma3_sdpa_pass(index: int = 0, model_config: Any = None) -> None:
+    @_aten_lowering_pass(index=index, model_config=model_config)
+    def gemma3_sdpa_pass(
+        gm: torch.fx.GraphModule, settings: CompilationSettings
+    ) -> torch.fx.GraphModule:
+        """SDPA pass specifically for Gemma3 models with sliding window attention."""
+        config = get_lowering_pass_config(gemma3_sdpa_pass)
         sliding_window = None
-        # Extract model-specific parameters
-        if model_config is not None:
-            if isinstance(model_config, Gemma3TextConfig):
-                sliding_window = getattr(model_config, "sliding_window", None)
-                layer_types = getattr(model_config, "layer_types", None)
-                logger.info(f"Model config: {sliding_window=} {layer_types=}")
-            else:
+        layer_types = None
+        model_config = config.get("model_config", None)
+        if not isinstance(model_config, Gemma3TextConfig):
            logger.warning(
-                "No model configuration provided, using default SDPA replacement behavior"
+                f"Expected Gemma3TextConfig, got {type(model_config)}, will use default SDPA replacement instead"
+            )
+        else:
+            sliding_window = getattr(model_config, "sliding_window", None)
+            layer_types = getattr(model_config, "layer_types", None)
+            logger.debug(
+                f"got Gemma3 config: sliding_window={sliding_window}, layer_types={layer_types}"
            )
+
         index = 0
         for node in gm.graph.nodes:
             if node.op == "call_function" and node.target in REPLACEABLE_ATEN_OPS:
@@ -94,116 +172,37 @@ def replace_variants_of_sdpa_with_config
                     sliding_window_size = sliding_window
                 index += 1
 
-                if (
-                    node.target
-                    == torch.ops.aten._scaled_dot_product_efficient_attention.default
-                ):
-                    if len(node.args) == 7:
-                        (
-                            query,
-                            key,
-                            value,
-                            attn_mask,
-                            compute_log_sumexp,
-                            dropout_p,
-                            is_causal,
-                        ) = node.args
-                    elif len(node.args) == 5:
-                        query, key, value, attn_mask, is_causal = node.args
-                        dropout_p = 0.0
-
-                    else:
-                        raise ValueError(
-                            f"Unexpected number of arguments for {node.target} in the graph"
-                        )
-                elif (
-                    node.target
-                    == torch.ops.aten._scaled_dot_product_flash_attention.default
-                ):
-                    if len(node.args) == 6:
-                        (
-                            query,
-                            key,
-                            value,
-                            dropout_p,
-                            is_causal,
-                            return_debug_mask,
-                        ) = node.args
-                    if len(node.args) == 5:
-                        query, key, value, dropout_p, is_causal = node.args
-                    elif len(node.args) == 3:
-                        query, key, value = node.args
-                        dropout_p = 0.0
-                        is_causal = True
-                    else:
-                        raise ValueError(
-                            f"Unexpected number of arguments for {node.target} in the graph"
-                        )
-
-                # always set_causal to True and generate attn_mask inside the sdpa operator, do not use the attn_mask from the transformers.
-                attn_mask = None
-                is_causal = True
-                dropout_p = 0.0
-
-                logger.warning(
-                    f"This current version of SDPA converter only supports {attn_mask=}, {dropout_p=} and {is_causal=} and {sliding_window_size=} configuration. This could cause issues with accuracy for models with different configurations."
-                )
-                modified_input_args = (
-                    query,
-                    key,
-                    value,
-                    attn_mask,
-                    dropout_p,
-                    is_causal,
+                # Process the node
+                logger.debug(
+                    f"Applying Gemma3-specific SDPA replacement with {node.name=}, {node.target=}, {sliding_window_size=}"
                 )
-                # Create a new node with torch.nn.functional.scaled_dot_product_attention
-                # The input args is (query, key, value, attn_mask, dropout_p, is_causal). kwargs has scale
-                with gm.graph.inserting_after(node):
-                    new_node = gm.graph.call_function(
-                        torch.nn.functional.scaled_dot_product_attention,
-                        args=modified_input_args,
-                        kwargs={
-                            "scale": node.kwargs.get("scale", None),
-                            "use_fp32_acc": settings.use_fp32_acc,
-                            "sliding_window_size": sliding_window_size,
-                        },
-                    )
-
-                # Deep copy encounters RuntimeError: Cannot access data pointer of Tensor (e.g. FakeTensor, FunctionalTensor). So we use copy instead.
-                new_node.meta = copy.copy(node.meta)
-                # Check if there's a getitem node following this attention node
-                for user in list(node.users):
-                    if (
-                        user.op == "call_function"
-                        and user.target == operator.getitem
-                    ):
-                        # If the getitem is extracting the first element (the output tensor)
-                        if user.args[1] == 0:
-                            # Replace all uses of the getitem with the new attention node
-                            user.replace_all_uses_with(new_node)
-                            new_node.meta["val"] = new_node.meta["val"][0]
-                # Replace all uses of the original node with the new node
-                node.replace_all_uses_with(new_node)
-
-                gm.graph.erase_node(node)
-
-        # Clean up the graph
+                gm = _process_sdpa_node(gm, node, settings, sliding_window_size)
+
         clean_up_graph_after_modifications(gm)
+        logger.debug("Applied Gemma3-specific SDPA replacement")
+        return gm
 
-        if model_config:
-            logger.debug(
-                f"Replaced variants of scaled_dot_product_attention for {getattr(model_config, 'model_type', 'unknown')} model"
-            )
-        else:
-            logger.debug(
-                "Replaced variants of scaled_dot_product_attention with torch.nn.functional.scaled_dot_product_attention"
-            )
-        add_attn_mask_as_output = False
-        if add_attn_mask_as_output:
-            add_one_attn_mask_as_output(gm)
+
+def register_default_sdpa_pass(index: int = 0, model_config: Any = None) -> None:
+    @_aten_lowering_pass(index=index, model_config=model_config)
+    def default_sdpa_pass(
+        gm: torch.fx.GraphModule,
+        settings: CompilationSettings,
+    ) -> torch.fx.GraphModule:
+        """Default SDPA pass for models without specific implementations."""
+
+        for node in gm.graph.nodes:
+            if node.op == "call_function" and node.target in REPLACEABLE_ATEN_OPS:
+                # Process the node with default logic
+                gm = _process_sdpa_node(gm, node, settings)
+
+        clean_up_graph_after_modifications(gm)
+        logger.debug("Applied default SDPA replacement")
         return gm
 
-    logger.info(
-        f"Registered SDPA pass with model config: {getattr(model_config, 'model_type', 'unknown')}"
-    )
-    return replace_variants_of_sdpa_with_config
+
+# Global registry for SDPA passes
+_SDPA_MAPPING: Dict[str, Callable] = {
+    "google/gemma-3-1b-it": register_gemma3_sdpa_pass,
+    "default": register_default_sdpa_pass,
+}
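
Since _SDPA_MAPPING is an ordinary module-level dict keyed by Hugging Face model id, supporting another model only requires a register function of the same shape plus a new mapping entry. A hedged sketch follows; the model id and pass body are illustrative, not part of this commit, and it assumes _aten_lowering_pass, REPLACEABLE_ATEN_OPS, _process_sdpa_node, and clean_up_graph_after_modifications are in scope as in the file above.

def register_llama_sdpa_pass(index: int = 0, model_config: Any = None) -> None:
    # Illustrative only: a model-specific pass that currently reuses the
    # shared node-processing logic from _process_sdpa_node.
    @_aten_lowering_pass(index=index, model_config=model_config)
    def llama_sdpa_pass(
        gm: torch.fx.GraphModule, settings: CompilationSettings
    ) -> torch.fx.GraphModule:
        for node in gm.graph.nodes:
            if node.op == "call_function" and node.target in REPLACEABLE_ATEN_OPS:
                gm = _process_sdpa_node(gm, node, settings)
        clean_up_graph_after_modifications(gm)
        return gm


# Hypothetical registry entry; run_llm.py would select it via args.model.
_SDPA_MAPPING["meta-llama/Llama-3.2-1B"] = register_llama_sdpa_pass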
