import torch.nn as nn
import torch.nn.functional as F

-
-def _apply_logit_softcapping(attn_scores: torch.Tensor, logit_cap: Optional[float]) -> torch.Tensor:
-    """Apply logit softcapping using the formula: logit_cap * tanh(logits / logit_cap)"""
-    if logit_cap is not None and logit_cap > 0.0:
-        return logit_cap * torch.tanh(attn_scores / logit_cap)
-    return attn_scores
+# TODO (nvchenghaoz): Remove related kernels once we have a backend-specific implementation for attention.


@torch.library.custom_op("auto_deploy::torch_attention_repeat_kv", mutates_args=())
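Note: the softcapping removed above is a plain tanh squash of the attention scores, near the identity while the logits are small relative to logit_cap and saturating toward +/- logit_cap otherwise. A standalone sketch with illustrative values (not part of the file):

import torch

def softcap(scores: torch.Tensor, cap: float) -> torch.Tensor:
    # cap * tanh(scores / cap): roughly the identity for |scores| << cap,
    # saturating toward +/- cap for large magnitudes.
    return cap * torch.tanh(scores / cap)

scores = torch.tensor([-200.0, -5.0, 0.0, 5.0, 200.0])
print(softcap(scores, 50.0))  # all values squashed into (-50, 50)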
@@ -82,86 +77,19 @@ def grouped_sdpa(
    dropout_p: float = 0.0,
    is_causal: bool = False,
    scale: Optional[float] = None,
-    sinks: Optional[torch.Tensor] = None,
-    sliding_window: Optional[int] = None,
-    logit_cap: Optional[float] = None,
) -> torch.Tensor:
89- """SDPA attention that can handle GQA. Expects bnsd format inputs."""
90- b , n_heads , s_q , head_dim = query .shape # bnsd format: [batch, num_heads, seq_len, head_dim]
91- _ , n_kv_heads , s_k , _ = key .shape # bnsd format: [batch, num_kv_heads, seq_len, head_dim]
92-
93- # Inputs are already in bnsd format, no need to transpose
94- query_t = query # [b, n_heads, s_q, head_dim]
95- key_t = key # [b, n_kv_heads, s_k, head_dim]
96- value_t = value # [b, n_kv_heads, s_k, v_head_dim]
97-
98- # Handle GQA by repeating KV if needed
99- if n_heads != n_kv_heads :
100- n_rep = n_heads // n_kv_heads
101- key_t = repeat_kv (key_t , n_rep ) # [b, n_heads, s_k, head_dim]
102- value_t = repeat_kv (value_t , n_rep ) # [b, n_heads, s_k, v_head_dim]
103-
104- # Set scale
105- if scale is None :
106- scale = 1.0 / math .sqrt (head_dim )
107-
108- # Compute attention scores: Q @ K^T
109- attn_scores = torch .matmul (query_t , key_t .transpose (- 2 , - 1 )) * scale # [b, n_heads, s_q, s_k]
110-
111- # Apply attention mask if provided
112- if attn_mask is not None :
113- attn_scores = attn_scores + attn_mask
114-
115- # Apply causal mask if specified
116- if is_causal :
117- causal_mask = torch .triu (
118- torch .ones (s_q , s_k , device = query .device , dtype = torch .bool ),
119- diagonal = s_k - s_q + 1 ,
120- )
121- attn_scores .masked_fill_ (causal_mask .unsqueeze (0 ).unsqueeze (0 ), float ("-inf" ))
122-
123- # Apply sliding window mask if specified
124- if sliding_window is not None and sliding_window > 0 :
125- # Create sliding window mask: each query position i can only attend to keys in [i-window_size+1, i]
126- query_positions = torch .arange (s_q , device = query .device ) # [s_q]
127- key_positions = torch .arange (s_k , device = query .device ) # [s_k]
128-
129- # Create position difference matrix: query_pos - key_pos
130- pos_diff = query_positions .unsqueeze (1 ) - key_positions .unsqueeze (0 ) # [s_q, s_k]
131-
132- # Sliding window mask: allow attention only if 0 <= pos_diff < sliding_window_size
133- sliding_window_mask = (pos_diff < 0 ) | (pos_diff >= sliding_window ) # [s_q, s_k]
134- attn_scores .masked_fill_ (sliding_window_mask .unsqueeze (0 ).unsqueeze (0 ), float ("-inf" ))
135-
136- # Apply logit softcapping if enabled
137- attn_scores = _apply_logit_softcapping (attn_scores , logit_cap )
138-
139- # Apply sinks if provided
140- if sinks is not None :
141- # Concatenate sinks to attention scores following the reference implementation
142- # sinks should have n_heads elements, each head gets its own sink value
143- # Expand sinks to [b, n_heads, s_q, 1] - one sink column per head
144- sinks_expanded = sinks .reshape (1 , - 1 , 1 , 1 ).expand (
145- b , n_heads , s_q , 1
146- ) # [b, n_heads, s_q, 1]
147-
148- # Concatenate along the key dimension (last dimension)
149- attn_weights = torch .cat ([attn_scores , sinks_expanded ], dim = - 1 )
150- attn_weights = torch .softmax (attn_weights , dim = - 1 , dtype = torch .float32 ).to (query .dtype )
151-
152- # Use only the non-sink portion for computing output
153- # We added exactly 1 column, so remove exactly 1 column
154- attn_out = torch .matmul (attn_weights [..., :- 1 ], value_t ) # [b, n_heads, s_q, v_head_dim]
155- else :
156- attn_weights = torch .softmax (attn_scores , dim = - 1 , dtype = torch .float32 ).to (query .dtype )
157- attn_out = torch .matmul (attn_weights , value_t ) # [b, n_heads, s_q, v_head_dim]
81+ """SDPA attention that can handle GQA."""
15882
159- # Apply dropout if specified
160- if dropout_p > 0.0 :
161- attn_out = F .dropout (attn_out , p = dropout_p , training = False )
162-
163- # Return in bnsd format (same as input format)
164- return attn_out
83+ return F .scaled_dot_product_attention (
84+ query .contiguous (),
85+ key .contiguous (),
86+ value .contiguous (),
87+ attn_mask = attn_mask ,
88+ dropout_p = dropout_p ,
89+ is_causal = is_causal ,
90+ scale = scale ,
91+ enable_gqa = True ,
92+ )
16593
16694
16795@grouped_sdpa .register_fake
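The removed path above handled GQA by materializing repeated KV heads before a manual softmax; the new call delegates that to SDPA via enable_gqa=True. A minimal sanity-check sketch of the equivalence for the plain causal case (no sinks, sliding window, or logit capping); it assumes a recent PyTorch where enable_gqa is available, and the shapes are illustrative:

import torch
import torch.nn.functional as F

b, n_heads, n_kv_heads, s, d = 2, 8, 2, 16, 64
q = torch.randn(b, n_heads, s, d)
k = torch.randn(b, n_kv_heads, s, d)
v = torch.randn(b, n_kv_heads, s, d)

# Manual GQA: repeat each KV head (interleaved), mirroring a repeat_kv-style expansion.
n_rep = n_heads // n_kv_heads
k_rep = k.repeat_interleave(n_rep, dim=1)
v_rep = v.repeat_interleave(n_rep, dim=1)

out_manual = F.scaled_dot_product_attention(q, k_rep, v_rep, is_causal=True)
out_gqa = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=True)

print(torch.allclose(out_manual, out_gqa, atol=1e-5))  # should print True (up to numerical tolerance)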
@@ -173,19 +101,16 @@ def grouped_sdpa_fake(
    dropout_p=0.0,
    is_causal=False,
    scale=None,
-    sinks=None,
-    sliding_window=None,
-    logit_cap=None,
):
    """Fake implementation of grouped SDPA."""
    return query.new_empty(*query.shape[:-1], value.shape[-1]).contiguous()


@torch.library.custom_op("auto_deploy::torch_attention_bsnd_grouped_sdpa", mutates_args=())
def bsnd_grouped_sdpa(
-    query: torch.Tensor,  # layout: [b, s_q, n, d]
-    key: torch.Tensor,  # layout: [b, s_k, n, d]
-    value: torch.Tensor,  # layout: [b, s_k, n, d]
+    query: torch.Tensor,  # layout: [b, n, s_q, d]
+    key: torch.Tensor,  # layout: [b, n, s_k, d]
+    value: torch.Tensor,  # layout: [b, n, s_k, d]
    attn_mask: Optional[torch.Tensor] = None,  # layout: [b, n, s_q, s_k]
    dropout_p: float = 0.0,
    is_causal: bool = False,
@@ -199,16 +124,14 @@ def bsnd_grouped_sdpa(
    Note that attn_mask layout is still assumed to be [b, n, s_q, s_k] and is consistent with the
    original sdpa op!
    """
-    # Transpose inputs to bnsd format for grouped_sdpa
-    query = query.transpose(1, 2)  # [b, s_q, n, d] -> [b, n, s_q, d]
-    key = key.transpose(1, 2)  # [b, s_k, n, d] -> [b, n, s_k, d]
-    value = value.transpose(1, 2)  # [b, s_k, n, d] -> [b, n, s_k, d]
-
-    # Call grouped_sdpa with bnsd inputs
-    out = grouped_sdpa(
-        query, key, value, attn_mask, dropout_p, is_causal, scale, sinks, sliding_window, logit_cap
-    )
-    # Transpose back to bsnd format
+    # let's transpose to bnsd so we can use the grouped sdpa
+    query = query.transpose(1, 2).contiguous()
+    key = key.transpose(1, 2).contiguous()
+    value = value.transpose(1, 2).contiguous()
+
+    out = grouped_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale)
+
+    # let's transpose back to bsnd
    return out.transpose(1, 2).contiguous()
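
With the extra arguments gone, the bsnd op takes only the standard SDPA parameters. A usage sketch, assuming the module that registers these custom ops has been imported so the torch.ops.auto_deploy namespace is populated; the shapes are illustrative:

import torch
# (plus whatever import registers the auto_deploy custom ops)

b, s, n_heads, n_kv_heads, d = 1, 32, 8, 2, 64
q = torch.randn(b, s, n_heads, d)     # bsnd layout: [batch, seq, heads, head_dim]
k = torch.randn(b, s, n_kv_heads, d)
v = torch.randn(b, s, n_kv_heads, d)

# Ops registered via torch.library.custom_op are exposed under torch.ops.<namespace>.
out = torch.ops.auto_deploy.torch_attention_bsnd_grouped_sdpa(
    q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True, scale=None
)
print(out.shape)  # torch.Size([1, 32, 8, 64]) -- output stays in bsnd layout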