
Commit f83e674

Fix a minor error during the kernel call
Signed-off-by: nvchenghaoz <[email protected]>
1 parent: cf9023a

2 files changed: +10 -2 lines changed


tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ def grouped_sdpa(
         dropout_p=dropout_p,
         is_causal=is_causal,
         scale=scale,
-        logit_cap=logit_cap,
+        enable_gqa=True,
     )
 
 
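For context: torch.nn.functional.scaled_dot_product_attention has no logit_cap parameter, so passing one fails at call time; enable_gqa=True (available since PyTorch 2.5) instead tells SDPA to broadcast the smaller set of KV heads across the query heads. A minimal sketch of that call pattern, with hypothetical shapes (the real grouped_sdpa presumably forwards to SDPA, given the matching keyword arguments):

```python
# Minimal sketch (hypothetical shapes; assumes the patched call targets
# torch.nn.functional.scaled_dot_product_attention). With enable_gqa=True,
# SDPA broadcasts the n_kv_heads K/V tensors across the n_heads query heads,
# so the KV heads do not need to be repeated by hand. Requires PyTorch >= 2.5
# and n_heads divisible by n_kv_heads.
import torch
import torch.nn.functional as F

b, n_heads, n_kv_heads, seq, head_dim = 2, 8, 2, 16, 64  # hypothetical sizes
q = torch.randn(b, n_heads, seq, head_dim)
k = torch.randn(b, n_kv_heads, seq, head_dim)
v = torch.randn(b, n_kv_heads, seq, head_dim)

out = F.scaled_dot_product_attention(
    q, k, v, dropout_p=0.0, is_causal=True, scale=None, enable_gqa=True
)
print(out.shape)  # torch.Size([2, 8, 16, 64])
```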

tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py

Lines changed: 9 additions & 1 deletion
@@ -56,6 +56,7 @@ def _generate_mha(
     stage1_output_logsumexp = torch.empty(
         b, n_heads, num_blocks, device=device, dtype=torch.float32
     ) - float("inf")
+
     update_kv_cache[(b, n_kv_heads, 1)](
         k,
         v,
@@ -74,7 +75,13 @@ def _generate_mha(
     )
 
     HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
-    gqa_attention_kv_stage1[(b, n_heads, num_blocks)](
+    gqa_attention_kv_stage1[
+        (
+            b,
+            n_kv_heads,
+            num_blocks,
+        )
+    ](
         q,
         k_cache,
         v_cache,
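The substantive fix here is the launch grid: stage 1 is now launched over n_kv_heads programs on axis 1 instead of n_heads. A hedged sketch with a toy kernel (not the real gqa_attention_kv_stage1; CUDA required) of how a 3-D Triton grid maps to tl.program_id, assuming axis 1 indexes the KV cache's head dimension:

```python
# Toy sketch (hypothetical kernel): each entry of the launch grid
# (b, n_kv_heads, num_blocks) becomes one program instance, identified by
# tl.program_id along each axis. If axis 1 indexes the KV cache's head
# dimension, launching n_heads programs there (n_heads > n_kv_heads in GQA)
# would index past the end of the cache.
import torch
import triton
import triton.language as tl


@triton.jit
def _grid_demo(out_ptr, N_KV_HEADS: tl.constexpr, NUM_BLOCKS: tl.constexpr):
    batch_id = tl.program_id(0)    # axis 0: batch
    kv_head_id = tl.program_id(1)  # axis 1: KV head, must stay < n_kv_heads
    block_id = tl.program_id(2)    # axis 2: KV-cache block
    idx = (batch_id * N_KV_HEADS + kv_head_id) * NUM_BLOCKS + block_id
    tl.store(out_ptr + idx, idx)


b, n_kv_heads, num_blocks = 2, 4, 3
out = torch.empty(b * n_kv_heads * num_blocks, device="cuda", dtype=torch.int32)
_grid_demo[(b, n_kv_heads, num_blocks)](out, N_KV_HEADS=n_kv_heads, NUM_BLOCKS=num_blocks)
assert out.tolist() == list(range(b * n_kv_heads * num_blocks))
```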
@@ -382,6 +389,7 @@ def get_constants(cls, source_attn_node: Node) -> List[Constant]:
             scale = source_attn_node.args[6]
         else:
             scale = source_attn_node.kwargs.get("scale", None)
+
         # do a sanity check on the scale if it is not None, we only support the default scale
         # of 1/sqrt(head_dim) and so we should do an approximate check for that one
         if not isinstance(scale, float):
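For reference, the approximate check the comment describes can be sketched as below; the helper name and tolerance are assumptions for illustration, not the repository's code:

```python
# Hypothetical helper illustrating the approximate check mentioned above:
# only the default SDPA scale of 1/sqrt(head_dim) is supported, and float
# round-off makes an exact equality test fragile.
import math


def _is_default_scale(scale, head_dim, rel_tol=1e-3):
    if scale is None:
        return True  # None means SDPA applies the default scale itself
    return math.isclose(scale, 1.0 / math.sqrt(head_dim), rel_tol=rel_tol)


assert _is_default_scale(None, 64)
assert _is_default_scale(0.125, 64)      # 1/sqrt(64) == 0.125
assert not _is_default_scale(1.0, 64)
```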
