Commit a65f0f1

add test case
1 parent a58d17b commit a65f0f1

5 files changed: +136 -43 lines changed

tools/llm/run_llm.py

Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ def compile_torchtrt(model, input_ids, args):
         use_fp32_acc=use_fp32_acc,
         device=DEVICE,
         disable_tf32=True,
-        use_python_runtime=True,
+        use_python_runtime=False,
         debug=args.debug,
         offload_module_to_cpu=True,
         min_block_size=args.min_block_size,
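The only functional change here flips the deployment runtime: use_python_runtime=False hands the compiled engines to the C++ runtime wrapper instead of executing them through the Python runtime. A minimal, self-contained sketch of the flag in isolation (the toy module, shapes, and min_block_size are illustrative assumptions, not taken from run_llm.py):

# Sketch only: a tiny module compiled with the C++ runtime selected.
import torch
import torch_tensorrt


class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1.0


x = torch.randn(2, 8).cuda()
ep = torch.export.export(Toy().eval().cuda(), (x,))

trt_module = torch_tensorrt.dynamo.compile(
    ep,
    inputs=(x,),
    min_block_size=1,
    # False selects the C++ runtime wrapper for the compiled engines;
    # True keeps execution in the Python runtime, which is often easier
    # to step through when debugging.
    use_python_runtime=False,
)
print(trt_module(x).shape)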

tools/llm/test_trt_sdpa.py (new file)

Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
import torch
import torch_tensorrt
from torch.export import Dim
from torchtrt_ext import register_sdpa


class SimpleNetwork(torch.nn.Module):
    def __init__(self):
        super(SimpleNetwork, self).__init__()

    def forward(self, query, key, value, attn_mask):
        with torch.backends.cuda.sdp_kernel(
            enable_flash=False,
            enable_math=False,
            enable_mem_efficient=True,
        ):
            return torch.nn.functional.scaled_dot_product_attention(
                query, key, value, attn_mask, 0.0, False, scale=0.0625
            )


dtype = torch.float32

dyn_dim = Dim("dyn_dim", min=3, max=32)

query = torch.randn((1, 4, 13, 256), dtype=dtype).cuda()
key = torch.randn((1, 4, 13, 256), dtype=dtype).cuda()
value = torch.randn((1, 4, 13, 256), dtype=dtype).cuda()
attn_mask = torch.ones((13, 13), dtype=torch.bool).tril(diagonal=0).cuda()
inputs = (query, key, value, attn_mask)

model = SimpleNetwork().eval().cuda()
output_pyt = model(*inputs)
exp_program = torch.export.export(
    model,
    inputs,
    strict=False,
    dynamic_shapes={
        "query": {2: dyn_dim},
        "key": {2: dyn_dim},
        "value": {2: dyn_dim},
        "attn_mask": {0: dyn_dim, 1: dyn_dim},
    },
)
DEBUG_LOGGING_DIR = "./debug_logs"
with torch_tensorrt.dynamo.Debugger(
    "graphs",
    logging_dir=DEBUG_LOGGING_DIR,
    capture_fx_graph_after=["complex_graph_detection"],
    save_engine_profile=True,
    profile_format="trex",
    engine_builder_monitor=True,
):
    trt_model = torch_tensorrt.dynamo.compile(
        exp_program,
        inputs=inputs,
        enabled_precisions={dtype},
        min_block_size=1,
        cache_built_engines=False,
        reuse_cached_engines=False,
        truncate_double=True,
        use_python_runtime=False,
    )
outputs_trt = trt_model(*inputs)
breakpoint()
assert torch.allclose(output_pyt, outputs_trt, rtol=1e-2, atol=1e-2)

print("Done")

tools/llm/torchtrt_ext/register_sdpa.py

Lines changed: 3 additions & 3 deletions

@@ -89,9 +89,9 @@ def replace_variants_of_sdpa(
                 logger.warning(
                     f"This current version of SDPA converter only supports attn_mask = None, dropout_p = 0.0 and is_causal = True configuration. This could cause issues with accuracy for models with different configurations."
                 )
-            # TODO: lan to figure out why is_causal is always False in google/gemma-3-1b-it, as in the config file it should be every 5 sliding window layer followed by a full attention layer
-            # also to figure out why the attn_mask passed in from transformers is not working
-            modified_input_args = (query, key, value, None, dropout_p, is_causal)
+            # TODO: lan to figure out why the attn_mask passed in from transformers is not working
+            # modified_input_args = (query, key, value, None, dropout_p, True)
+            modified_input_args = (query, key, value, attn_mask, dropout_p, is_causal)
             # Create a new node with torch.nn.functional.scaled_dot_product_attention
             # The input args is (query, key, value, is_causal). kwargs has scale
             with gm.graph.inserting_after(node):
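The rewrite now forwards the original attn_mask into the replacement scaled_dot_product_attention node instead of dropping it. As a plain eager-mode illustration of why this is safe for the causal case (the snippet below is illustrative and not part of the commit): with a lower-triangular boolean mask and matching query/key lengths, the explicit-mask path reproduces is_causal=True.

# Illustrative eager-mode check (not from the commit): an explicit
# lower-triangular boolean attn_mask matches is_causal=True when L == S.
import torch
import torch.nn.functional as F

q = torch.randn(1, 4, 13, 256)
k = torch.randn(1, 4, 13, 256)
v = torch.randn(1, 4, 13, 256)
causal_mask = torch.ones(13, 13, dtype=torch.bool).tril()

out_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=causal_mask)
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)
assert torch.allclose(out_masked, out_causal, atol=1e-5)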

tools/llm/torchtrt_ext/sdpa_converter.py

Lines changed: 64 additions & 38 deletions

@@ -161,51 +161,77 @@ def scaled_dot_product_attention(
     L = impl.shape.shape(ctx, target, source_ir, name + "_shape_0", query, 2)
     if S < 0:
         S = impl.shape.shape(ctx, target, source_ir, name + "_shape_1", key, 2)
-
     # generate the mask tensor
     if is_causal:
         tril_tensor = tril(ctx, target, source_ir, name + "_tril", L, S)
     else:
-        # hard code the sliding window size to 512 for now
-        tril_tensor = tril(ctx, target, source_ir, name + "_tril", L, S, 512)
         # TODO: lan to figure out why attn_mask passed in from transformers is not working
-        # tried both 2d and 4d, but both are not working, hence the following code is commented out
-        # assert len(attn_mask.shape) in [2, 4], f"attn_mask must be 2D or 4D, but got {attn_mask.shape=}"
-        # if len(attn_mask.shape) == 4:
-        #     if attn_mask.shape[0] != 1:
-        #         attn_mask = impl.slice.slice_op(ctx, target, source_ir, name + "_slice", attn_mask, 0, 0, 1, 1)
-        #     if attn_mask.shape[1] != 1:
-        #         attn_mask = impl.slice.slice_op(ctx, target, source_ir, name + "_slice", attn_mask, 1, 0, 1, 1)
-        #     attn_mask = impl.squeeze.squeeze(ctx, target, source_ir, name + "_squeeze", attn_mask, (0, 1))
-        # tril_tensor = attn_mask
-
-    temp_mask = impl.unary.logical_not(
-        ctx, target, source_ir, name + "_logical_not", tril_tensor
-    )
+        # tried both 2d and 4d, but both are not working
+        assert len(attn_mask.shape) in [
+            2,
+            4,
+        ], f"attn_mask must be 2D or 4D, but got {attn_mask.shape=}"
+        if len(attn_mask.shape) == 4:
+            if attn_mask.shape[0] != 1:
+                attn_mask = impl.slice.slice_op(
+                    ctx, target, source_ir, name + "_slice", attn_mask, 0, 0, 1, 1
+                )
+            if attn_mask.shape[1] != 1:
+                attn_mask = impl.slice.slice_op(
+                    ctx, target, source_ir, name + "_slice", attn_mask, 1, 0, 1, 1
+                )
+            attn_mask = impl.squeeze.squeeze(
+                ctx, target, source_ir, name + "_squeeze", attn_mask, (0, 1)
+            )
+        tril_tensor = attn_mask

-    # This need_mask determines if we want to use the causal mask or not
-    # When KV caching is enabled, L = 1 and != S. In this case, we shouldn't use the causal mask.
-    # So need_mask will be all False values in this case.
-    # TODO: Implement more general case where L != 1 and S != L
-    need_mask = impl.elementwise.eq(ctx, target, source_ir, name + "_eq", L, S)
-    temp_mask = impl.elementwise.logical_and(
-        ctx, target, source_ir, name + "_logical_and", need_mask, temp_mask
-    )
-    temp_mask_casted = cast_trt_tensor(
-        ctx, temp_mask, query_dtype, name + "_casted_bool", target, source_ir
-    )
+    # generate attn_bias via where instead of (logical_and, sub, log) to see whether nan is related to this
+    attn_bias_via_where = True
+    if attn_bias_via_where:
+        attn_bias = impl.condition.where(
+            ctx,
+            target,
+            source_ir,
+            name + "_where",
+            torch.tensor(0.0, dtype=torch.float32).cuda(),
+            torch.tensor(-float("inf"), dtype=torch.float32).cuda(),
+            tril_tensor,
+        )
+    else:
+        temp_mask = impl.unary.logical_not(
+            ctx, target, source_ir, name + "_logical_not", tril_tensor
+        )
+        temp_mask = cast_trt_tensor(
+            ctx, temp_mask, trt.float32, name + "_casted_bool", target, source_ir
+        )
+        temp_mask = impl.elementwise.mul(
+            ctx, target, source_ir, name + "_mul_-inf", temp_mask, float("-inf")
+        )
+        attn_bias = temp_mask

-    one_minus_temp_mask = impl.elementwise.sub(
-        ctx,
-        target,
-        source_ir,
-        name + "_one_minus_temp_mask",
-        1.0,
-        temp_mask_casted,
-    )
-    attn_bias = impl.unary.log(
-        ctx, target, source_ir, name + "_log", one_minus_temp_mask
-    )
+        # This need_mask determines if we want to use the causal mask or not
+        # When KV caching is enabled, L = 1 and != S. In this case, we shouldn't use the causal mask.
+        # So need_mask will be all False values in this case.
+        # TODO: Implement more general case where L != 1 and S != L
+        need_mask = impl.elementwise.eq(ctx, target, source_ir, name + "_eq", L, S)
+        temp_mask = impl.elementwise.logical_and(
+            ctx, target, source_ir, name + "_logical_and", need_mask, temp_mask
+        )
+        temp_mask_casted = cast_trt_tensor(
+            ctx, temp_mask, query_dtype, name + "_casted_bool", target, source_ir
+        )
+
+        one_minus_temp_mask = impl.elementwise.sub(
+            ctx,
+            target,
+            source_ir,
+            name + "_one_minus_temp_mask",
+            1.0,
+            temp_mask_casted,
+        )
+        attn_bias = impl.unary.log(
+            ctx, target, source_ir, name + "_log", one_minus_temp_mask
+        )

     scaled_add_attn_bias = impl.elementwise.add(
         ctx, target, source_ir, name + "_attn_bias_add", mm, attn_bias

tools/llm/utils.py

Lines changed: 0 additions & 1 deletion

@@ -179,7 +179,6 @@ def generate_with_dynamic_cache(model, input_seq, max_output_seq_length, eos_tok
     num_tokens_generated = 0
     kv_cache = get_zeroed_dynamic_cache_inputs(model)
     last_position_id = position_ids[-1, -1].item()
-    breakpoint()
     while num_tokens_generated < num_output_tokens:
         is_generate = False if input_seq.shape[1] > 1 else True
         position_ids = (
