Commit f9b66ad

register lowering pass with model config
1 parent 51a60a5 commit f9b66ad

5 files changed: +195 -135 lines changed


py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 23 additions & 3 deletions
@@ -1,5 +1,5 @@
 import logging
-from typing import Callable, Optional, Sequence, Union
+from typing import Any, Callable, Optional, Sequence, Union
 
 import torch
 from torch_tensorrt.dynamo._settings import CompilationSettings
@@ -55,20 +55,28 @@
 def _aten_lowering_pass(
     *args: LoweringPassSignature,
     index: Optional[int] = None,
+    **kwargs: Any,
 ) -> Union[
     LoweringPassSignature, Callable[[LoweringPassSignature], LoweringPassSignature]
 ]:
     """Adds a lowering pass to the registry, at a specified index if desired
 
     If no index is specified, the lowering pass is inserted at the end of the list
+
+    Additional keyword arguments can be passed to configure the lowering pass behavior.
+    These will be stored as metadata on the pass function.
     """
 
     def add_lowering_pass(
         lowering_pass: LoweringPassSignature,
     ) -> LoweringPassSignature:
+        # Store additional parameters as metadata on the function
+        if kwargs:
+            lowering_pass._lowering_pass_config = kwargs
+
         ATEN_POST_LOWERING_PASSES.add_pass_with_index(lowering_pass, index)
         logger.debug(
-            f"Added lowering pass {lowering_pass} to list at index {index}, current passlist: {ATEN_POST_LOWERING_PASSES}"
+            f"Added lowering pass {lowering_pass} to list at index {index} with config {kwargs}, current passlist: {ATEN_POST_LOWERING_PASSES}"
         )
         return lowering_pass
 
@@ -83,7 +91,7 @@ def add_lowering_pass(
                 f"aten_lowering_pass decorator called with invalid arguments {args} "
                 "To specify an index to insert the pass, use the keyword 'index='"
             )
-    # If no arguments are specified, the decorator was called with an index keyword
+    # If no arguments are specified, the decorator was called with keyword arguments
     else:
         return add_lowering_pass
 
@@ -97,6 +105,18 @@ def _remove_lowering_pass(*, index: int) -> None:
     return
 
 
+def get_lowering_pass_config(lowering_pass: LoweringPassSignature) -> dict[str, Any]:
+    """Get the configuration parameters for a lowering pass function
+
+    Args:
+        lowering_pass: The lowering pass function
+
+    Returns:
+        Dictionary containing the configuration parameters, or empty dict if none
+    """
+    return getattr(lowering_pass, "_lowering_pass_config", {})
+
+
 def post_lowering(
     gm: torch.fx.GraphModule, settings: CompilationSettings = CompilationSettings()
 ) -> torch.fx.GraphModule:

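For reference, a minimal sketch of how the new keyword-argument plumbing can be used. Only _aten_lowering_pass and get_lowering_pass_config come from this commit; the example pass body and its block_size parameter are hypothetical placeholders.

# Hypothetical usage sketch of the kwargs-based pass configuration added above.
# Only _aten_lowering_pass and get_lowering_pass_config are introduced by this commit;
# the pass body and the block_size keyword are illustrative placeholders.
import torch
from torch_tensorrt.dynamo._settings import CompilationSettings
from torch_tensorrt.dynamo.lowering.passes._aten_lowering_pass import (
    _aten_lowering_pass,
    get_lowering_pass_config,
)


@_aten_lowering_pass(index=0, block_size=64)  # extra kwargs are stored as pass metadata
def my_example_pass(
    gm: torch.fx.GraphModule, settings: CompilationSettings
) -> torch.fx.GraphModule:
    # Read back the configuration the decorator attached to this function.
    config = get_lowering_pass_config(my_example_pass)
    block_size = config.get("block_size", 32)  # hypothetical parameter
    # ... transform gm using block_size ...
    return gm
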
tools/llm/run_llm.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ def get_model(args):
         .eval()
         .cuda()
     )
+    register_sdpa.register_sdpa_pass_with_model_config(model_config=model.config)
 
     if args.precision == "FP16":
         model = model.to(torch.float16)

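To see where this call sits in the runner flow, here is a condensed, illustrative sketch of the get_model-style usage; the model id and the import style are placeholders, not part of the commit.

# Condensed, illustrative version of the change to get_model(); the model id
# and import style are placeholders, not part of the commit.
import torch
from transformers import AutoModelForCausalLM

from torchtrt_ext import register_sdpa  # assumes tools/llm is on sys.path

model = (
    AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
    .eval()
    .cuda()
)
# New in this commit: hand the HF config to the SDPA lowering pass so that
# per-layer attention types (e.g. Gemma3 sliding windows) are known at lowering time.
register_sdpa.register_sdpa_pass_with_model_config(model_config=model.config)
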
tools/llm/torchtrt_ext/register_sdpa.py

Lines changed: 165 additions & 123 deletions
@@ -13,6 +13,7 @@
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
     clean_up_graph_after_modifications,
 )
+from transformers import Gemma3TextConfig
 
 from .sdpa_converter import *
 
@@ -34,134 +35,175 @@
 }
 
 
-@_aten_lowering_pass
-def replace_variants_of_sdpa(
-    gm: torch.fx.GraphModule, settings: CompilationSettings
-) -> torch.fx.GraphModule:
-    """Replace scaled_dot_product_attention with an equivalent
-    implementation which can be accurately converted to TRT
+def register_sdpa_pass_with_model_config(index: int = 0, model_config=None):
     """
+    Register the SDPA replacement pass with a specific model configuration.
 
-    for node in gm.graph.nodes:
-        attn_mask = None
-        is_causal = False
-        if node.op == "call_function" and node.target in REPLACEABLE_ATEN_OPS:
-            if (
-                node.target
-                == torch.ops.aten._scaled_dot_product_efficient_attention.default
-            ):
-                if len(node.args) == 7:
-                    (
-                        query,
-                        key,
-                        value,
-                        attn_mask,
-                        compute_log_sumexp,
-                        dropout_p,
-                        is_causal,
-                    ) = node.args
-                elif len(node.args) == 5:
-                    query, key, value, attn_mask, is_causal = node.args
-                    dropout_p = 0.0
-
-                else:
-                    raise ValueError(
-                        f"Unexpected number of arguments for {node.target} in the graph"
-                    )
-            elif (
-                node.target
-                == torch.ops.aten._scaled_dot_product_flash_attention.default
-            ):
-                if len(node.args) == 6:
-                    (
-                        query,
-                        key,
-                        value,
-                        dropout_p,
-                        is_causal,
-                        return_debug_mask,
-                    ) = node.args
-                if len(node.args) == 5:
-                    query, key, value, dropout_p, is_causal = node.args
-                elif len(node.args) == 3:
-                    query, key, value = node.args
-                    dropout_p = 0.0
-                    is_causal = True
-                else:
-                    raise ValueError(
-                        f"Unexpected number of arguments for {node.target} in the graph"
-                    )
+    Args:
+        model_config: The model configuration object (e.g., from transformers.AutoConfig)
+        index: Position in the lowering pass list (default: 0)
+
+    Example:
+        from transformers import AutoConfig
+        config = AutoConfig.from_pretrained("microsoft/DialoGPT-medium")
+        register_sdpa_pass_with_model_config(config)
+    """
+    from torch_tensorrt.dynamo.lowering.passes._aten_lowering_pass import (
+        _aten_lowering_pass,
+        _remove_lowering_pass,
+    )
 
+    # Create a new pass with the model configuration
+    @_aten_lowering_pass(index=index, model_config=model_config)
+    def replace_variants_of_sdpa_with_config(
+        gm: torch.fx.GraphModule, settings: CompilationSettings
+    ) -> torch.fx.GraphModule:
+        """Replace scaled_dot_product_attention with model-specific configuration"""
+
+        # Access the model configuration from the decorator parameters
+        from torch_tensorrt.dynamo.lowering.passes._aten_lowering_pass import (
+            get_lowering_pass_config,
+        )
+
+        config = get_lowering_pass_config(replace_variants_of_sdpa_with_config)
+
+        model_config = config.get("model_config", None)
+        layer_types = []
+        sliding_window = None
+        # Extract model-specific parameters
+        if model_config is not None:
+            if isinstance(model_config, Gemma3TextConfig):
+                sliding_window = getattr(model_config, "sliding_window", None)
+                layer_types = getattr(model_config, "layer_types", None)
+            logger.info(f"Model config: {sliding_window=} {layer_types=}")
+        else:
             logger.warning(
-                f"This current version of SDPA converter only supports attn_mask = {attn_mask}, dropout_p = {dropout_p} and is_causal = {is_causal} configuration. This could cause issues with accuracy for models with different configurations."
+                "No model configuration provided, using default SDPA replacement behavior"
             )
-            modified_input_args = (query, key, value, attn_mask, dropout_p, is_causal)
-            # Create a new node with torch.nn.functional.scaled_dot_product_attention
-            # The input args is (query, key, value, attn_mask, dropout_p, is_causal). kwargs has scale
-            with gm.graph.inserting_after(node):
-                new_node = gm.graph.call_function(
-                    torch.nn.functional.scaled_dot_product_attention,
-                    args=modified_input_args,
-                    kwargs={
-                        "scale": node.kwargs.get("scale", None),
-                        "use_fp32_acc": settings.use_fp32_acc,
-                    },
+        index = 0
+        for node in gm.graph.nodes:
+            if node.op == "call_function" and node.target in REPLACEABLE_ATEN_OPS:
+                sliding_window_size = None
+                if (
+                    sliding_window is not None
+                    and sliding_window > 0
+                    and layer_types is not None
+                    and index < len(layer_types)
+                ):
+                    if layer_types[index] == "sliding_attention":
+                        sliding_window_size = sliding_window
+                index += 1
+
+                if (
+                    node.target
+                    == torch.ops.aten._scaled_dot_product_efficient_attention.default
+                ):
+                    if len(node.args) == 7:
+                        (
+                            query,
+                            key,
+                            value,
+                            attn_mask,
+                            compute_log_sumexp,
+                            dropout_p,
+                            is_causal,
+                        ) = node.args
+                    elif len(node.args) == 5:
+                        query, key, value, attn_mask, is_causal = node.args
+                        dropout_p = 0.0
+
+                    else:
+                        raise ValueError(
+                            f"Unexpected number of arguments for {node.target} in the graph"
+                        )
+                elif (
+                    node.target
+                    == torch.ops.aten._scaled_dot_product_flash_attention.default
+                ):
+                    if len(node.args) == 6:
+                        (
+                            query,
+                            key,
+                            value,
+                            dropout_p,
+                            is_causal,
+                            return_debug_mask,
+                        ) = node.args
+                    if len(node.args) == 5:
+                        query, key, value, dropout_p, is_causal = node.args
+                    elif len(node.args) == 3:
+                        query, key, value = node.args
+                        dropout_p = 0.0
+                        is_causal = True
+                    else:
+                        raise ValueError(
+                            f"Unexpected number of arguments for {node.target} in the graph"
+                        )
+
+                # always set_causal to True and generate attn_mask inside the sdpa operator, do not use the attn_mask from the transformers.
+                attn_mask = None
+                is_causal = True
+                dropout_p = 0.0
+
+                logger.warning(
+                    f"This current version of SDPA converter only supports {attn_mask=}, {dropout_p=} and {is_causal=} and {sliding_window_size=} configuration. This could cause issues with accuracy for models with different configurations."
                 )
+                modified_input_args = (
+                    query,
+                    key,
+                    value,
+                    attn_mask,
+                    dropout_p,
+                    is_causal,
+                )
+                # Create a new node with torch.nn.functional.scaled_dot_product_attention
+                # The input args is (query, key, value, attn_mask, dropout_p, is_causal). kwargs has scale
+                with gm.graph.inserting_after(node):
+                    new_node = gm.graph.call_function(
+                        torch.nn.functional.scaled_dot_product_attention,
+                        args=modified_input_args,
+                        kwargs={
+                            "scale": node.kwargs.get("scale", None),
+                            "use_fp32_acc": settings.use_fp32_acc,
+                            "sliding_window_size": sliding_window_size,
+                        },
+                    )
+
+                    # Deep copy encounters RuntimeError: Cannot access data pointer of Tensor (e.g. FakeTensor, FunctionalTensor). So we use copy instead.
+                    new_node.meta = copy.copy(node.meta)
+                    # Check if there's a getitem node following this attention node
+                    for user in list(node.users):
+                        if (
+                            user.op == "call_function"
+                            and user.target == operator.getitem
+                        ):
+                            # If the getitem is extracting the first element (the output tensor)
+                            if user.args[1] == 0:
+                                # Replace all uses of the getitem with the new attention node
+                                user.replace_all_uses_with(new_node)
+                                new_node.meta["val"] = new_node.meta["val"][0]
+                    # Replace all uses of the original node with the new node
+                    node.replace_all_uses_with(new_node)
+
+                gm.graph.erase_node(node)
+
+        # Clean up the graph
+        clean_up_graph_after_modifications(gm)
+
+        if model_config:
+            logger.debug(
+                f"Replaced variants of scaled_dot_product_attention for {getattr(model_config, 'model_type', 'unknown')} model"
+            )
+        else:
+            logger.debug(
+                "Replaced variants of scaled_dot_product_attention with torch.nn.functional.scaled_dot_product_attention"
+            )
+        add_attn_mask_as_output = False
+        if add_attn_mask_as_output:
+            add_one_attn_mask_as_output(gm)
+        return gm
 
-                # Deep copy encounters RuntimeError: Cannot access data pointer of Tensor (e.g. FakeTensor, FunctionalTensor). So we use copy instead.
-                new_node.meta = copy.copy(node.meta)
-                # Check if there's a getitem node following this attention node
-                for user in list(node.users):
-                    if user.op == "call_function" and user.target == operator.getitem:
-                        # If the getitem is extracting the first element (the output tensor)
-                        if user.args[1] == 0:
-                            # Replace all uses of the getitem with the new attention node
-                            user.replace_all_uses_with(new_node)
-                            new_node.meta["val"] = new_node.meta["val"][0]
-                # Replace all uses of the original node with the new node
-                node.replace_all_uses_with(new_node)
-
-            gm.graph.erase_node(node)
-
-    # Clean up the graph
-    clean_up_graph_after_modifications(gm)
-
-    logger.debug(
-        "Replaced variants of scaled_dot_product_attention with torch.nn.functional.scaled_dot_product_attention"
+    logger.info(
+        f"Registered SDPA pass with model config: {getattr(model_config, 'model_type', 'unknown')}"
     )
-    add_attn_mask_as_output = False
-    if add_attn_mask_as_output:
-        add_one_attn_mask_as_output(gm)
-    return gm
-
-
-# try to add one of the attn_mask as output, so that I can actually see the shape and value in the generation phase.
-def add_one_attn_mask_as_output(gm: torch.fx.GraphModule):
-    import torch.utils._pytree as pytree
-    from cache_utils import create_random_output_tensors
-
-    attn_mask_node = None
-    for node in gm.graph.nodes:
-        if (
-            node.op == "call_function"
-            and node.target == torch.nn.functional.scaled_dot_product_attention
-        ):
-            attn_mask_node = node.args[3]
-            break
-
-    output_node = next(node for node in gm.graph.nodes if node.op == "output")
-
-    current_outputs = output_node.args[0]
-    if isinstance(current_outputs, tuple):
-        new_outputs = current_outputs + (attn_mask_node,)
-    else:
-        new_outputs = (current_outputs, attn_mask_node)
-    output_node.args = new_outputs
-    gm.graph.output(new_outputs)
-    gm.graph.erase_node(output_node)
-
-    gm = clean_up_graph_after_modifications(gm)
-    new_output_tensors = create_random_output_tensors(new_outputs)
-    new_out_spec = pytree.tree_flatten(new_output_tensors)[1]
-    gm._out_spec = new_out_spec
-    return gm
+    return replace_variants_of_sdpa_with_config

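The core of the new pass is the per-layer pairing between SDPA nodes (visited in graph order) and model_config.layer_types: only layers marked "sliding_attention" get a sliding_window_size forwarded to the converter. Below is a standalone sketch of that selection logic using plain lists instead of fx nodes; the helper name is illustrative, not from the commit. Note also that since the new signature is register_sdpa_pass_with_model_config(index: int = 0, model_config=None), the config should be passed by keyword as run_llm.py does; a positional call would bind it to index.

# Standalone sketch of the per-layer selection the registered pass performs
# (simplified: plain lists instead of torch.fx nodes; helper name is illustrative).
from typing import List, Optional


def sliding_window_per_layer(
    layer_types: Optional[List[str]], sliding_window: Optional[int]
) -> List[Optional[int]]:
    """sliding_window_size handed to each attention node, in graph order."""
    if not layer_types or not sliding_window or sliding_window <= 0:
        return []
    return [
        sliding_window if layer_type == "sliding_attention" else None
        for layer_type in layer_types
    ]


# Gemma-3-like pattern: five sliding-attention layers per full-attention layer.
print(sliding_window_per_layer(["sliding_attention"] * 5 + ["full_attention"], 512))
# -> [512, 512, 512, 512, 512, None]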