Commit 29bb062

nvchenghaoz authored and lucaslie committed
Add sinks / sliding window for Triton backend (#95)
Signed-off-by: nvchenghaoz <[email protected]>
1 parent a8b54f9 commit 29bb062

5 files changed, +148 -22 lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/_triton_attention_internal.py

Lines changed: 8 additions & 1 deletion
@@ -100,6 +100,8 @@ def _paged_generate_mha(
         n_heads,
         d_head,
         SEQ_BLOCK_SIZE,
+        False,
+        None,
     )
 
 
@@ -338,6 +340,7 @@ def _generate_mha_rope_fusion(
         d_head,
         SEQ_BLOCK_SIZE,
         HEAD_BLOCK_SIZE,
+        -1,
     )
     attention_kv_stage2[(b, n_heads, 1)](
         stage1_output_values,
@@ -348,6 +351,8 @@ def _generate_mha_rope_fusion(
         n_heads,
         d_head,
         SEQ_BLOCK_SIZE,
+        False,
+        None,
     )
 
 
@@ -414,7 +419,9 @@ def _flattened_context_mha_rope_fusion(
         d_head,
         SEQ_BLOCK,
         max_cache_seq_len,
-        num_stages=2,
+        -1,
+        False,
+        None,
     )
 
 
tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py

Lines changed: 28 additions & 3 deletions
@@ -41,6 +41,8 @@ def _generate_mha(
     input_pos: torch.Tensor,
     scale: float,
     out: torch.Tensor,
+    sinks: Optional[torch.Tensor] = None,
+    sliding_window: Optional[int] = None,
 ):
     b, (n_heads, q_d_head) = q.shape[0], q.shape[-2:]
     max_seq_len, n_kv_heads = k_cache.shape[1:3]
@@ -97,7 +99,10 @@ def _generate_mha(
         v_d_head,
         SEQ_BLOCK_SIZE,
         HEAD_BLOCK_SIZE,
+        sliding_window if sliding_window is not None else -1,
     )
+    has_sinks = sinks is not None
+
     attention_kv_stage2[(b, n_heads, 1)](
         stage1_output_values,
         stage1_output_logsumexp,
@@ -107,6 +112,8 @@ def _generate_mha(
         n_heads,
         v_d_head,
         SEQ_BLOCK_SIZE,
+        has_sinks,
+        sinks,
     )
 
 
@@ -122,6 +129,8 @@ def _flattened_context_mha(
     seq_start: torch.Tensor,
     scale: float,
     out: torch.Tensor,
+    sinks: Optional[torch.Tensor] = None,
+    sliding_window: Optional[int] = None,
 ) -> None:
     # NOTE: s_total == sum(seq_len)
     s_total, n_heads, q_d_head = q.shape
@@ -149,6 +158,8 @@ def _flattened_context_mha(
 
     # TODO: use input_pos to get the correct cache locations
     grid = (BATCH_SIZE, n_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
+    has_sinks = sinks is not None
+
     context_attention_kv_flattened[grid](
         q,
         seq_len,
@@ -165,7 +176,9 @@ def _flattened_context_mha(
         v_d_head,
         SEQ_BLOCK,
         max_cache_seq_len,
-        num_stages=2,
+        sliding_window if sliding_window is not None else -1,
+        has_sinks,
+        sinks,
     )
 
 
@@ -187,6 +200,8 @@ def flattened_mha_with_cache(
     # <none>
     # CONSTANTS
     scale: Optional[float],
+    sinks: Optional[torch.Tensor] = None,
+    sliding_window: Optional[int] = None,
 ) -> torch.Tensor:
     """Flattened MHA with cache that takes q, k, v in BSND layout.
 
@@ -223,7 +238,9 @@ def flattened_mha_with_cache(
     y = q.new_empty(*bs_view, num_heads, v_head_dim).contiguous()
     if s == 1:
         # generate-only phase
-        _generate_mha(q, k, v, k_cache, v_cache, cache_loc, input_pos, scale, y)
+        _generate_mha(
+            q, k, v, k_cache, v_cache, cache_loc, input_pos, scale, y, sinks, sliding_window
+        )
     else:
         # mixed context + generate phase
         _flattened_context_mha(
@@ -238,6 +255,8 @@ def flattened_mha_with_cache(
             seq_start,
             scale,
             y,
+            sinks,
+            sliding_window,
         )
 
     return y.view(*output_shape)
@@ -255,6 +274,8 @@ def flattened_mha_fake(
     k_cache: torch.Tensor,
     v_cache: torch.Tensor,
     scale: Optional[float],
+    sinks: Optional[torch.Tensor] = None,
+    sliding_window: Optional[int] = None,
 ):
     return q.new_empty(*q.shape[:-1], v.shape[-1]).contiguous()
 
@@ -388,7 +409,11 @@ def get_constants(cls, source_attn_node: Node) -> List[Constant]:
         if not isinstance(scale, float):
             ad_logger.warning("Provided scale is not a float, Using default scale instead.")
             scale = None
-
+        # Get sinks and sliding_window from args or kwargs
+        sinks = extract_op_args(source_attn_node, "sinks")[0]
+        sliding_window = extract_op_args(source_attn_node, "sliding_window")[0]
         return [
             scale,  # softmax scale
+            sinks,
+            sliding_window,
         ]
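For intuition, a rough eager-mode reference of the semantics these new arguments expose: sink logits join the softmax denominator without contributing value vectors, and a sliding window of size W limits each query to its last W key positions. Shapes, the per-head sink layout, and the function name below are assumptions for illustration, not the backend implementation:

import torch


def reference_attention(q, k, v, sinks=None, sliding_window=None, scale=None):
    # q: [s_q, n_heads, d_head]; k, v: [s_k, n_heads, d_head]; sinks: [n_heads] or None.
    s_q, n_heads, d_head = q.shape
    s_k = k.shape[0]
    scale = scale if scale is not None else d_head**-0.5
    logits = torch.einsum("qhd,khd->hqk", q, k) * scale
    q_pos = torch.arange(s_q)[:, None] + (s_k - s_q)  # absolute query positions
    k_pos = torch.arange(s_k)[None, :]
    mask = k_pos <= q_pos  # causal
    if sliding_window is not None:
        mask &= k_pos >= q_pos - sliding_window + 1  # keep only the last W keys
    logits = logits.masked_fill(~mask, float("-inf"))
    if sinks is not None:
        # Sink logits enlarge the softmax denominator but carry no values.
        sink_col = sinks.view(n_heads, 1, 1).expand(n_heads, s_q, 1)
        probs = torch.softmax(torch.cat([logits, sink_col], dim=-1), dim=-1)[..., :-1]
    else:
        probs = torch.softmax(logits, dim=-1)
    return torch.einsum("hqk,khd->qhd", probs, v)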

tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py

Lines changed: 56 additions & 9 deletions
@@ -112,6 +112,7 @@ def gqa_attention_kv_stage1(
     V_D_HEAD: tl.constexpr,  # Dimension of each key/value head
     SEQ_BLOCK_SIZE: tl.constexpr,  # Block size used for tiling the sequence dim.
     HEAD_BLOCK_SIZE: tl.constexpr,  # pad to 16 if HEAD_RATIO is < 16 to invoke tensor cores.
+    SLIDING_WINDOW: tl.constexpr,
 ):
     """Attention kernel to be used for generate-only batches.
@@ -122,7 +123,7 @@ def gqa_attention_kv_stage1(
     Supports non-power-of-2 D_HEAD
 
     Uses flash decoding.
-    KV-cache layout is assumed to be [Batch,Seq, Head, Dim]
+    KV-cache layout is assumed to be [Batch, Seq, Head, Dim]
     1. Fetch the K-cache from 0 to input_pos
     2. Fetch the V-cache from 0 to input_pos
     3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
@@ -145,10 +146,20 @@ def gqa_attention_kv_stage1(
 
     # The number of Q heads that map to each KV head.
     HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS  # This needs to be a power-of-2
-    if seq_start_pos > kv_position:
-        return
-    seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
-    seq_mask = seq_offsets <= kv_position
+
+    # Apply sliding window constraints
+    if SLIDING_WINDOW > 0:
+        # For sliding window, limit the sequence range
+        sliding_start = tl.maximum(0, kv_position - SLIDING_WINDOW + 1)
+        if seq_start_pos + SEQ_BLOCK_SIZE <= sliding_start or seq_start_pos > kv_position:
+            return
+        seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
+        seq_mask = (seq_offsets <= kv_position) & (seq_offsets >= sliding_start)
+    else:
+        if seq_start_pos > kv_position:
+            return
+        seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
+        seq_mask = seq_offsets <= kv_position
 
     # Need to pad the head dim to 16 if HEAD_RATIO is < 16 so that tensor cores can be invoked
     #
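In the decode (stage 1) path each program handles a single query at absolute position kv_position, so the window reduces to a 1-D range check over one SEQ_BLOCK tile of keys. A small PyTorch sketch of the same mask, with illustrative names:

import torch


def decode_window_mask(kv_position: int, seq_start_pos: int, seq_block: int, sliding_window: int):
    # Keys this tile may attend to for the single query at kv_position.
    seq_offsets = seq_start_pos + torch.arange(seq_block)
    causal = seq_offsets <= kv_position
    if sliding_window > 0:
        sliding_start = max(0, kv_position - sliding_window + 1)
        return causal & (seq_offsets >= sliding_start)
    return causal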
@@ -358,6 +369,8 @@ def attention_kv_stage2(
     N_HEADS: tl.constexpr,
     D_HEAD: tl.constexpr,
     SEQ_BLOCK_SIZE: tl.constexpr,  # Nearest power of 2 for num_blocks
+    HAS_SINKS: tl.constexpr,
+    sinks_ptr,
 ):
     # There are batch * N_HEADS programs
     batch_id = tl.program_id(axis=0)
@@ -382,6 +395,11 @@ def attention_kv_stage2(
     sumexp = tl.exp(logsumexp - max_logsumexp)  # [NUM_BLOCKS_POW2]
 
     aggregate_sumexp = tl.sum(sumexp, axis=0)
+    # Add sinks contribution to the softmax denominator
+    if HAS_SINKS:
+        sinks_val = tl.load(sinks_ptr + batch_id * N_HEADS + head_id)
+        sinks_exp = tl.exp(sinks_val - max_logsumexp)
+        aggregate_sumexp += sinks_exp
 
     values_offsets = block_offsets[:, None] * D_HEAD + dhead_offsets[None, :]
     values_mask = block_mask[:, None] * dhead_mask[None, :]
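Stage 2 combines the per-block partial outputs, and the sink logit only changes the normalization: the numerator is still a weighted sum over real key blocks, while the denominator gains exp(sink - max). A small illustrative reduction in PyTorch (names and shapes are assumptions):

import torch


def combine_stage1_partials(values, logsumexp, sink=None):
    # values: [num_blocks, d_head] per-block outputs; logsumexp: [num_blocks]; sink: scalar tensor or None.
    max_lse = logsumexp.max()
    weights = torch.exp(logsumexp - max_lse)
    denom = weights.sum()
    if sink is not None:
        denom = denom + torch.exp(sink - max_lse)  # sink only enlarges the denominator
    return (weights[:, None] * values).sum(dim=0) / denom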
@@ -573,6 +591,9 @@ def context_attention_kv_flattened(
     V_D_HEAD: tl.constexpr,  # Dimension of each value head.
     SEQ_BLOCK: tl.constexpr,
     MAX_SEQ_LENGTH: tl.constexpr,
+    SLIDING_WINDOW: tl.constexpr,  # Sliding window size, -1 means no sliding window
+    HAS_SINKS: tl.constexpr,
+    sinks_ptr,
 ):
     """Kernel for context phase.
 
@@ -623,7 +644,15 @@ def context_attention_kv_flattened(
     # input_pos_ptr stores the location at which kv must be written back for the given batch.
     kv_position = tl.load(input_pos_ptr + batch_id)
     num_blocks = (kv_position + seq_len + SEQ_BLOCK - 1) // SEQ_BLOCK
-    for s in range(0, num_blocks + 1, 1):
+    start = 0
+    if SLIDING_WINDOW > 0:
+        # Use the LAST query in this block for more conservative start calculation
+        last_q_pos = (
+            (seq_block_id + 1) * SEQ_BLOCK - 1 + kv_position
+        )  # Last query's absolute position
+        earliest_kv_pos = max(0, last_q_pos - SLIDING_WINDOW + 1)
+        start = max(0, earliest_kv_pos // SEQ_BLOCK)
+    for s in range(start, num_blocks + 1):
         kv_seq_offsets = s * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
         kv_seq_mask = kv_seq_offsets < (kv_position + seq_len)
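The loop start is a block-level shortcut: the highest query position in this tile is last_q_pos, and no query in the tile can see keys earlier than last_q_pos - SLIDING_WINDOW + 1, so whole KV blocks before that point can be skipped while the per-element mask below still enforces the exact window. For example, with SEQ_BLOCK = 64, SLIDING_WINDOW = 128, kv_position = 0 and seq_block_id = 4, the last query sits at position 319, the earliest visible key is 192, and the loop starts at block 192 // 64 = 3. A tiny illustrative helper:

def first_kv_block(seq_block_id: int, seq_block: int, kv_position: int, sliding_window: int) -> int:
    # Earliest KV block any query in this tile can attend to.
    last_q_pos = (seq_block_id + 1) * seq_block - 1 + kv_position
    return max(0, last_q_pos - sliding_window + 1) // seq_block


assert first_kv_block(seq_block_id=4, seq_block=64, kv_position=0, sliding_window=128) == 3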

@@ -637,9 +666,17 @@ def context_attention_kv_flattened(
         )
         qk = tl.zeros([SEQ_BLOCK, SEQ_BLOCK], dtype=tl.float32)
         qk += tl.dot(q, k.trans())
-        qk = tl.where(
-            (seq_offsets[:, None] + kv_position) >= kv_seq_offsets[None, :], qk, float("-inf")
-        )
+        # Apply causal mask
+        causal_mask = (seq_offsets[:, None] + kv_position) >= kv_seq_offsets[None, :]
+        # Apply sliding window mask if enabled
+        if SLIDING_WINDOW > 0:
+            sliding_window_mask = kv_seq_offsets[None, :] >= (
+                seq_offsets[:, None] + kv_position - SLIDING_WINDOW + 1
+            )
+            combined_mask = sliding_window_mask & causal_mask
+        else:
+            combined_mask = causal_mask
+        qk = tl.where(combined_mask, qk, float("-inf"))
         qk *= SCALE
         # rowmax
         m_ij = tl.maximum(tl.max(qk, 1), lse_i)
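The combined mask in the context phase is two-dimensional: the causal term keeps keys at or before each query's absolute position, and the window term additionally drops keys more than SLIDING_WINDOW - 1 positions behind it. A short eager-mode sketch of the same tile mask (illustrative names):

import torch


def tile_mask(seq_offsets, kv_seq_offsets, kv_position, sliding_window=-1):
    # seq_offsets: [SEQ_BLOCK] query offsets; kv_seq_offsets: [SEQ_BLOCK] key positions.
    causal = (seq_offsets[:, None] + kv_position) >= kv_seq_offsets[None, :]
    if sliding_window > 0:
        window = kv_seq_offsets[None, :] >= (seq_offsets[:, None] + kv_position - sliding_window + 1)
        return causal & window
    return causal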
@@ -662,6 +699,16 @@ def context_attention_kv_flattened(
         l_i_new = tl.exp(lse_i - m_ij) + l_ij
         lse_i = m_ij + tl.log(l_i_new)
 
+    # Add sinks contribution to the final softmax calculation
+    if HAS_SINKS:
+        sinks_val = tl.load(sinks_ptr + batch_id * N_HEADS + head_id)
+        m_sinks = tl.maximum(m_i, sinks_val)
+        acc_scale = tl.exp(m_i - m_sinks)
+        acc = acc * acc_scale[:, None]
+        l_sinks = tl.exp(lse_i - m_sinks) + tl.exp(sinks_val - m_sinks)
+        lse_i = m_sinks + tl.log(l_sinks)
+        m_i = m_sinks
+
     o_scale = tl.exp(m_i - lse_i)
 
     acc = acc * o_scale[:, None]
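Folding the sink logit into the running flash-attention state (row max m_i, log-sum-exp lse_i, accumulator acc) is one extra rescale before the final normalization. A small PyTorch sketch of that merge step, assuming m_i and lse_i are per-row vectors and sink is a scalar tensor:

import torch


def merge_sink(acc, m_i, lse_i, sink):
    # Returns updated (acc, m_i, lse_i) with the sink folded into the softmax state.
    m_new = torch.maximum(m_i, sink)
    acc = acc * torch.exp(m_i - m_new)[:, None]  # rescale accumulator to the new max
    lse_new = m_new + torch.log(torch.exp(lse_i - m_new) + torch.exp(sink - m_new))
    return acc, m_new, lse_new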
