Commit 7228d98

Add the softcap import and related transform
Signed-off-by: Chenghao Zhang <[email protected]>
1 parent f1dcc35 commit 7228d98

6 files changed: +122 -39 lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py

Lines changed: 49 additions & 14 deletions
@@ -40,27 +40,58 @@ def scaled_dot_product_attention(
     dropout_p: float = 0.0,
     is_causal: bool = False,
     scale: Optional[float] = None,
+    logit_cap: Optional[float] = None,
 ) -> torch.Tensor:
     """A carbon copy of torch.nn.functional.scaled_dot_product_attention as custom op.

     Using this custom op instead of using the functional directly ensures consistent representation
     of the vanilla sdpa in a graph.
     """

-    return F.scaled_dot_product_attention(
-        query.contiguous(),
-        key.contiguous(),
-        value.contiguous(),
-        attn_mask=attn_mask,
-        dropout_p=dropout_p,
-        is_causal=is_causal,
-        scale=scale,
-    )
+    # Handle soft capping by applying it manually since F.scaled_dot_product_attention
+    # may not support soft_cap parameter
+    if logit_cap is not None:
+        # Apply manual soft capping to the attention scores
+        # First compute raw attention scores
+        d_k = query.size(-1)
+        if scale is None:
+            scale = 1.0 / (d_k**0.5)
+
+        # Compute attention scores
+        scores = torch.matmul(query, key.transpose(-2, -1)) * scale
+
+        # Apply soft capping: tanh(scores / logit_cap) * logit_cap
+        scores = torch.tanh(scores / logit_cap) * logit_cap
+
+        if attn_mask is not None:
+            scores += attn_mask
+
+        # Apply softmax
+        attn_weights = F.softmax(scores, dim=-1)
+
+        # Apply dropout if specified
+        if dropout_p > 0.0:
+            attn_weights = F.dropout(attn_weights, p=dropout_p, training=torch.is_grad_enabled())
+
+        # Apply attention to values
+        output = torch.matmul(attn_weights, value)
+        return output.contiguous()
+    else:
+        # Use standard SDPA when no soft capping
+        return F.scaled_dot_product_attention(
+            query.contiguous(),
+            key.contiguous(),
+            value.contiguous(),
+            attn_mask=attn_mask,
+            dropout_p=dropout_p,
+            is_causal=is_causal,
+            scale=scale,
+        )


 @scaled_dot_product_attention.register_fake
 def scaled_dot_product_attention_fake(
-    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
+    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None, logit_cap=None
 ):
     """Fake implementation of scaled_dot_product_attention."""
     return query.new_empty(*query.shape[:-1], value.shape[-1]).contiguous()
@@ -75,18 +106,20 @@ def grouped_sdpa(
     dropout_p: float = 0.0,
     is_causal: bool = False,
     scale: Optional[float] = None,
+    logit_cap: Optional[float] = None,
 ) -> torch.Tensor:
     """SDPA attention that can handle GQA."""

-    return F.scaled_dot_product_attention(
+    # Use our custom scaled_dot_product_attention that supports soft capping
+    return scaled_dot_product_attention(
         query.contiguous(),
         key.contiguous(),
         value.contiguous(),
         attn_mask=attn_mask,
         dropout_p=dropout_p,
         is_causal=is_causal,
         scale=scale,
-        enable_gqa=True,
+        logit_cap=logit_cap,
     )


@@ -99,6 +132,7 @@ def grouped_sdpa_fake(
     dropout_p=0.0,
     is_causal=False,
     scale=None,
+    logit_cap=None,
 ):
     """Fake implementation of grouped SDPA."""
     return query.new_empty(*query.shape[:-1], value.shape[-1]).contiguous()
@@ -113,6 +147,7 @@ def bsnd_grouped_sdpa(
     dropout_p: float = 0.0,
     is_causal: bool = False,
     scale: Optional[float] = None,
+    logit_cap: Optional[float] = None,
 ) -> torch.Tensor:
     """Attention that assumes the input layout is bsnd.
@@ -124,15 +159,15 @@ def bsnd_grouped_sdpa(
     key = key.transpose(1, 2).contiguous()
     value = value.transpose(1, 2).contiguous()

-    out = grouped_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale)
+    out = grouped_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale, logit_cap)

     # let's transpose back to bnsd
     return out.transpose(1, 2).contiguous()


 @bsnd_grouped_sdpa.register_fake
 def bsnd_grouped_sdpa_fake(
-    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
+    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None, logit_cap=None
 ):
     """Fake implementation of bnsd grouped SDPA."""
     return query.new_empty(*query.shape[:-1], value.shape[-1]).contiguous()

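For readers skimming the diff, the math in the new `logit_cap` branch is just a tanh squash of the pre-softmax scores. Below is a minimal, self-contained sketch (not part of the commit; shapes, names, and tolerances are illustrative) of that reference computation, plus a check that it reduces to plain SDPA when no cap is given:

```python
import torch
import torch.nn.functional as F

def ref_soft_capped_attention(q, k, v, logit_cap=None, scale=None):
    """Reference attention with optional soft capping (no mask/dropout for brevity)."""
    scale = scale if scale is not None else q.size(-1) ** -0.5
    scores = torch.matmul(q, k.transpose(-2, -1)) * scale
    if logit_cap is not None:
        # Soft capping squashes logits into (-logit_cap, logit_cap) while keeping gradients smooth.
        scores = logit_cap * torch.tanh(scores / logit_cap)
    return torch.matmul(F.softmax(scores, dim=-1), v)

q, k, v = (torch.randn(1, 8, 16, 64) for _ in range(3))

# With logit_cap=None the reference matches the fused SDPA path used in the else-branch above.
torch.testing.assert_close(
    ref_soft_capped_attention(q, k, v),
    F.scaled_dot_product_attention(q, k, v),
    rtol=1e-4,
    atol=1e-4,
)
print(ref_soft_capped_attention(q, k, v, logit_cap=50.0).shape)  # torch.Size([1, 8, 16, 64])
```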
tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py

Lines changed: 16 additions & 11 deletions
@@ -40,6 +40,7 @@ def _generate_mha(
     cache_locs: torch.Tensor,
     input_pos: torch.Tensor,
     scale: float,
+    logit_cap: Optional[float],
     out: torch.Tensor,
 ):
     b, (n_heads, q_d_head) = q.shape[0], q.shape[-2:]
@@ -55,7 +56,6 @@ def _generate_mha(
     stage1_output_logsumexp = torch.empty(
         b, n_heads, num_blocks, device=device, dtype=torch.float32
     ) - float("inf")
-
     update_kv_cache[(b, n_kv_heads, 1)](
         k,
         v,
@@ -74,13 +74,7 @@ def _generate_mha(
     )

     HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
-    gqa_attention_kv_stage1[
-        (
-            b,
-            n_kv_heads,
-            num_blocks,
-        )
-    ](
+    gqa_attention_kv_stage1[(b, n_heads, num_blocks)](
         q,
         k_cache,
         v_cache,
@@ -97,6 +91,7 @@ def _generate_mha(
         v_d_head,
         SEQ_BLOCK_SIZE,
         HEAD_BLOCK_SIZE,
+        LOGIT_CAP=logit_cap,
     )
     attention_kv_stage2[(b, n_heads, 1)](
         stage1_output_values,
@@ -122,6 +117,7 @@ def _flattened_context_mha(
     seq_start: torch.Tensor,
     scale: float,
     out: torch.Tensor,
+    logit_cap: Optional[float],
 ) -> None:
     # NOTE: s_total == sum(seq_len)
     s_total, n_heads, q_d_head = q.shape
@@ -166,6 +162,7 @@ def _flattened_context_mha(
         SEQ_BLOCK,
         max_cache_seq_len,
         num_stages=2,
+        LOGIT_CAP=logit_cap,
     )


@@ -187,6 +184,7 @@ def flattened_mha_with_cache(
     # <none>
     # CONSTANTS
     scale: Optional[float],
+    logit_cap: Optional[float],
 ) -> torch.Tensor:
     """Flattened MHA with cache that takes q, k, v in BSND layout.
@@ -223,7 +221,7 @@ def flattened_mha_with_cache(
     y = q.new_empty(*bs_view, num_heads, v_head_dim).contiguous()
     if s == 1:
         # generate-only phase
-        _generate_mha(q, k, v, k_cache, v_cache, cache_loc, input_pos, scale, y)
+        _generate_mha(q, k, v, k_cache, v_cache, cache_loc, input_pos, scale, logit_cap, y)
     else:
         # mixed context + generate phase
         _flattened_context_mha(
@@ -237,7 +235,8 @@ def flattened_mha_with_cache(
             seq_len,
             seq_start,
             scale,
-            y,
+            out=y,
+            logit_cap=logit_cap,
         )

     return y.view(*output_shape)
@@ -255,6 +254,7 @@ def flattened_mha_fake(
     k_cache: torch.Tensor,
     v_cache: torch.Tensor,
     scale: Optional[float],
+    logit_cap: Optional[float],
 ):
     return q.new_empty(*q.shape[:-1], v.shape[-1]).contiguous()

@@ -382,13 +382,18 @@ def get_constants(cls, source_attn_node: Node) -> List[Constant]:
             scale = source_attn_node.args[6]
         else:
             scale = source_attn_node.kwargs.get("scale", None)
-
         # do a sanity check on the scale if it is not None, we only support the default scale
         # of 1/sqrt(head_dim) and so we should do an approximate check for that one
         if not isinstance(scale, float):
             ad_logger.warning("Provided scale is not a float, Using default scale instead.")
             scale = None

+        if len(source_attn_node.args) > 7:
+            logit_cap = source_attn_node.args[7]
+        else:
+            logit_cap = source_attn_node.kwargs.get("logit_cap", None)
+
         return [
             scale,  # softmax scale
+            logit_cap,  # soft capping scale
         ]

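The `get_constants` change reads `logit_cap` the same way `scale` is read: positionally when the source node carries enough args, otherwise from the kwargs. A tiny sketch of that lookup pattern (the `SimpleNamespace` node and the argument positions below are illustrative stand-ins for a real `torch.fx.Node`):

```python
from types import SimpleNamespace

def get_constant(node, idx, name, default=None):
    """Read a value that may have been passed positionally or as a keyword."""
    if len(node.args) > idx:
        return node.args[idx]
    return node.kwargs.get(name, default)

# Stand-in for a node recording sdpa(q, k, v, None, 0.0, True, 0.125, logit_cap=50.0)
node = SimpleNamespace(
    args=("q", "k", "v", None, 0.0, True, 0.125),
    kwargs={"logit_cap": 50.0},
)

print(get_constant(node, 6, "scale"))      # 0.125 -> found positionally at args[6]
print(get_constant(node, 7, "logit_cap"))  # 50.0  -> falls back to kwargs
```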
tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py

Lines changed: 7 additions & 2 deletions
@@ -128,8 +128,10 @@ def gqa_attention_kv_stage1(
     1. Fetch the K-cache from 0 to input_pos
     2. Fetch the V-cache from 0 to input_pos
     3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
-    4. S = softmax(A)
-    5. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
+    4. A = A * scale
+    5. A = logit_cap * tanh(A / logit_cap) if logit_cap is not None
+    6. S = softmax(A)
+    7. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
     """
     # Assume KV-cache layout: [Batch, Seq, Head, Dim]
     # A program is responsible for 1 batch, 1 head and a block of sequences.
@@ -577,6 +579,7 @@ def context_attention_kv_flattened(
     V_D_HEAD: tl.constexpr,  # Dimension of each value head.
     SEQ_BLOCK: tl.constexpr,
     MAX_SEQ_LENGTH: tl.constexpr,
+    LOGIT_CAP: tl.constexpr = None,
 ):
     """Kernel for context phase.
@@ -645,6 +648,8 @@ def context_attention_kv_flattened(
         (seq_offsets[:, None] + kv_position) >= kv_seq_offsets[None, :], qk, float("-inf")
     )
     qk *= SCALE
+    if LOGIT_CAP is not None:
+        qk = LOGIT_CAP * tanh(qk / LOGIT_CAP)
     # rowmax
     m_ij = tl.maximum(tl.max(qk, 1), lse_i)
     p = tl.exp(qk - m_ij[:, None])

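Because `LOGIT_CAP` is a `tl.constexpr`, the `if LOGIT_CAP is not None` check is resolved at compile time, so the capped and uncapped cases become separate kernel specializations. A standalone sketch of the same pattern (requires a CUDA GPU; the kernel and names below are illustrative, and tanh is computed via `tl.exp` here purely to keep the example dependency-free, whereas the commit uses a `tanh` helper):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def scale_and_cap_kernel(x_ptr, y_ptr, n, SCALE: tl.constexpr, LOGIT_CAP: tl.constexpr = None, BLOCK: tl.constexpr = 128):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask) * SCALE
    if LOGIT_CAP is not None:
        # Compile-time branch: when LOGIT_CAP is None this code is not emitted at all.
        t = 1.0 - 2.0 / (tl.exp(2.0 * x / LOGIT_CAP) + 1.0)  # tanh(x / LOGIT_CAP)
        x = LOGIT_CAP * t
    tl.store(y_ptr + offs, x, mask=mask)

x = torch.randn(1024, device="cuda")
y = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 128),)
scale_and_cap_kernel[grid](x, y, x.numel(), SCALE=0.125)                  # uncapped specialization
scale_and_cap_kernel[grid](x, y, x.numel(), SCALE=0.125, LOGIT_CAP=50.0)  # capped specialization
```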
tensorrt_llm/_torch/auto_deploy/models/decilm.py

Lines changed: 5 additions & 1 deletion
@@ -7,7 +7,11 @@


 def _from_pretrained_patched(pretrained_model_name_or_path, **kwargs):
     print(str(pretrained_model_name_or_path))
-    if re.search(r"Llama-3_(?:1|3)-Nemotron-(?:Ultra|Super)", str(pretrained_model_name_or_path)):
+
+    # Use the eager attention implementation for Gemma-2 models to import the soft logit capping ops.
+    if re.search(
+        r"Llama-3_(?:1|3)-Nemotron-(?:Ultra|Super)", str(pretrained_model_name_or_path)
+    ) or re.search(r"gemma-2", str(pretrained_model_name_or_path), re.IGNORECASE):
         kwargs["attn_implementation"] = "eager"
     return _orig_from_pretrained(pretrained_model_name_or_path, **kwargs)

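The widened check simply forces `attn_implementation="eager"` for any checkpoint name matching either pattern. A quick illustration of what the combined condition accepts (the model names below are examples, not an exhaustive or verified list):

```python
import re

def needs_eager_attention(name: str) -> bool:
    return bool(
        re.search(r"Llama-3_(?:1|3)-Nemotron-(?:Ultra|Super)", name)
        or re.search(r"gemma-2", name, re.IGNORECASE)
    )

print(needs_eager_attention("google/gemma-2-9b-it"))                     # True (case-insensitive "gemma-2")
print(needs_eager_attention("nvidia/Llama-3_1-Nemotron-Ultra-253B-v1"))  # True
print(needs_eager_attention("meta-llama/Llama-3.1-8B-Instruct"))         # False -> keep default implementation
```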
tensorrt_llm/_torch/auto_deploy/transformations/library/attention.py

Lines changed: 43 additions & 8 deletions
@@ -293,7 +293,8 @@ def _match_eager_attention_pattern(final_matmul_node: Node) -> Optional[Dict[str
     Match the eager attention pattern starting from the final matmul node.

     The pattern is:
-    transpose -> matmul -> mul/div -> (optional) add -> (optional) to -> softmax -> (optional) to -> dropout -> matmul
+    transpose -> matmul -> mul/div -> (optional) div -> tanh -> mul (soft capping)
+    -> (optional) add -> (optional) to -> softmax -> (optional) to -> dropout -> matmul

     Returns a dictionary with information about the match or None if no match.
     """
@@ -352,21 +353,51 @@ def _match_eager_attention_pattern(final_matmul_node: Node) -> Optional[Dict[str
         prev_node = prev_node.args[0]

     # Check for attention mask pattern (add node)
+    attn_mask = None
     if is_op(prev_node, torch.ops.aten.add):
         add_node = prev_node
         attn_mask = add_node.args[1]  # Second arg is the mask

-        # The add should have a mul or div node as its first argument
+        # The add should have input as its first argument
         if len(add_node.args) < 1:
             return None

-        scaling_node = add_node.args[0]
-        if not (is_op(scaling_node, torch.ops.aten.mul) or is_op(scaling_node, torch.ops.aten.div)):
-            return None
-    elif is_op(prev_node, torch.ops.aten.mul) or is_op(prev_node, torch.ops.aten.div):
-        # No mask case - the softmax input is directly the mul or div node
+        prev_node = add_node.args[0]
+
+    # Check for optional soft capping pattern: div -> tanh -> mul
+    logit_cap = None
+    if is_op(prev_node, torch.ops.aten.mul):
+        # Check if this mul is part of soft capping (mul after tanh)
+        if len(prev_node.args) >= 2:
+            mul_input = prev_node.args[0]
+            soft_cap_mul_factor = prev_node.args[1]
+
+            # Check if the input to mul is tanh
+            if is_op(mul_input, torch.ops.aten.tanh):
+                if len(mul_input.args) >= 1:
+                    tanh_input = mul_input.args[0]
+
+                    # Check if the input to tanh is div (completing the soft cap pattern)
+                    if is_op(tanh_input, torch.ops.aten.div):
+                        if len(tanh_input.args) >= 2:
+                            div_input = tanh_input.args[0]
+                            soft_cap_div_factor = tanh_input.args[1]
+
+                            # Verify that the div and mul factors are the same (soft cap scale)
+                            if isinstance(soft_cap_div_factor, (float, int)) and isinstance(
+                                soft_cap_mul_factor, (float, int)
+                            ):
+                                if abs(soft_cap_div_factor - soft_cap_mul_factor) < 1e-6:
+                                    logit_cap = soft_cap_div_factor
+                                    prev_node = div_input
+                            elif soft_cap_div_factor == soft_cap_mul_factor:
+                                # Same node/tensor used for both operations
+                                logit_cap = soft_cap_div_factor
+                                prev_node = div_input
+
+    # Now prev_node should be the scaling operation (mul or div)
+    if is_op(prev_node, torch.ops.aten.mul) or is_op(prev_node, torch.ops.aten.div):
         scaling_node = prev_node
-        attn_mask = None
     else:
         return None

@@ -422,6 +453,10 @@ def _match_eager_attention_pattern(final_matmul_node: Node) -> Optional[Dict[str
     if attn_mask is not None:
         match_info["attn_mask"] = attn_mask

+    # Add soft cap scale if it exists
+    if logit_cap is not None:
+        match_info["logit_cap"] = logit_cap
+
     return match_info


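For reference, the div -> tanh -> mul chain the matcher now recognizes is what an eager (non-SDPA) attention implementation with logit soft capping produces when traced. A sketch of such an implementation (illustrative, modeled on Gemma-2-style soft capping, not copied from any library):

```python
import torch
import torch.nn.functional as F

def eager_attention_with_soft_cap(q, k, v, attn_mask=None, scale=0.125, logit_cap=50.0, dropout_p=0.0):
    """Eager attention whose traced graph contains the soft-cap chain matched above."""
    scores = torch.matmul(q, k.transpose(-2, -1)) * scale  # scaling mul/div
    scores = scores / logit_cap                            # aten.div   (soft cap, step 1)
    scores = torch.tanh(scores)                            # aten.tanh  (soft cap, step 2)
    scores = scores * logit_cap                            # aten.mul   (soft cap, step 3, same constant)
    if attn_mask is not None:
        scores = scores + attn_mask                        # optional aten.add (mask)
    attn = F.softmax(scores, dim=-1)
    attn = F.dropout(attn, p=dropout_p)
    return torch.matmul(attn, v)                           # final matmul the matcher starts from

q = k = v = torch.randn(1, 8, 16, 64)
print(eager_attention_with_soft_cap(q, k, v).shape)  # torch.Size([1, 8, 16, 64])
```

Walking back from the final matmul, the matcher finds mul(tanh(div(scores, c)), c), checks that the two constants agree, and records c as `logit_cap` in `match_info`.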
tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 2 additions & 3 deletions
@@ -182,11 +182,10 @@ def __call__(self, cm: CachedSequenceInterface) -> GraphModule:
             from .library import visualize_namespace

             visualize_namespace(egm, args=cm.args, dynamic_shapes=cm.dynamic_shapes)
+        except ImportError:
             ad_logger.warning(
-                "Please run `pip install -r examples/auto_deploy/requirements.txt` to visualize"
-                " the graph."
+                "Please run `pip install -r examples/auto_deploy/requirements.txt` to visualize the graph."
             )
-        except ImportError:
             pass

        ############################################################################################
