
Commit 096dbeb

Ensure that last prefill chunk is handled correctly by Mamba models (#2897)
Signed-off-by: Keshav Santhanam <ksanthanam@nvidia.com>
1 parent: ba456fd

10 files changed (+5712, -34 lines)

megatron/core/inference/contexts/attention_context/mamba_metadata.py
Lines changed: 3 additions & 5 deletions

@@ -154,7 +154,7 @@ def update(
             active_mamba_indices[:real_decode_count]
         )
         if padded_decode_count > real_decode_count:
-            self._batch_indices_decode_buffer[real_decode_count:padded_decode_count].fill_(-1)
+            self._batch_indices_decode_buffer[real_decode_count:padded_decode_count] = -1
         self.batch_indices_decode = self._batch_indices_decode_buffer[:padded_decode_count]

         # Determine if we have a chunked prefill request and adjust counts for regular prefill
@@ -180,9 +180,7 @@ def update(
         )

         if padded_prefill_count > regular_prefill_count:
-            self._batch_indices_prefill_buffer[
-                regular_prefill_count:padded_prefill_count
-            ].fill_(-1)
+            self._batch_indices_prefill_buffer[regular_prefill_count:padded_prefill_count] = -1

         self.batch_indices_prefill = self._batch_indices_prefill_buffer[:padded_prefill_count]

@@ -199,7 +197,7 @@ def update(
         )

         if padded_token_count > seq_len:
-            self._seq_idx_buffer[:, seq_len:padded_token_count].fill_(-1)
+            self._seq_idx_buffer[:, seq_len:padded_token_count] = -1
         self.seq_idx = self._seq_idx_buffer[:, :padded_token_count]

         # Update cu_seqlens
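Both idioms write the padding region of the preallocated buffer in place; the diff only swaps the Tensor.fill_ call for slice assignment. A minimal standalone sketch of the two forms (the buffer name and counts below are illustrative, not taken from the repo):

    import torch

    # Stand-ins for the real decode count and the padded count.
    real_decode_count, padded_decode_count = 3, 8
    batch_indices_decode_buffer = torch.arange(16)

    # Old idiom: in-place fill through a slice view.
    batch_indices_decode_buffer[real_decode_count:padded_decode_count].fill_(-1)

    # New idiom: slice assignment, which also writes in place.
    batch_indices_decode_buffer[real_decode_count:padded_decode_count] = -1

    # Either way, the padded tail holds -1 sentinels and the metadata exposes
    # only a view of the first padded_decode_count entries.
    batch_indices_decode = batch_indices_decode_buffer[:padded_decode_count]
    print(batch_indices_decode)  # tensor([ 0,  1,  2, -1, -1, -1, -1, -1])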

megatron/core/inference/contexts/dynamic_context.py
Lines changed: 11 additions & 11 deletions

@@ -540,6 +540,7 @@ def __init__(
         self.use_cuda_graphs_for_non_decode_steps = use_cuda_graphs_for_non_decode_steps
         # Deal with chunked prefill
         self.chunked_prefill_request_id = -1
+        self.has_explicit_chunked_prefill_req = False

         # FlashInfer.
         if use_flashinfer_fused_rope is True:
@@ -1300,15 +1301,11 @@ def initialize_attention_state(
         if construct_graph_dimensions is not None:
             self.add_dummy_requests_for_cudagraph_capture(construct_graph_dimensions)

-        has_explicit_chunked_prefill_req = (
-            self.chunked_prefill_request_id != -1 and self.is_hybrid_model
-        )
-
         batch_dimensions = InferenceBatchDimensions(
             token_count=self.active_token_count,
             prefill_req_count=self.num_prefill_requests,
             decode_req_count=self.num_decode_requests,
-            has_explicit_chunked_prefill_req=has_explicit_chunked_prefill_req,
+            has_explicit_chunked_prefill_req=self.has_explicit_chunked_prefill_req,
         )
         self.batch_dimensions = batch_dimensions
         best_graph = CUDAGraphBatchDimensionBuilder.match_graph_config(
@@ -1342,7 +1339,7 @@ def initialize_attention_state(
             token_count=padded_token_count,
             prefill_req_count=padded_prefill_req_count,
             decode_req_count=padded_decode_req_count,
-            has_explicit_chunked_prefill_req=has_explicit_chunked_prefill_req,
+            has_explicit_chunked_prefill_req=self.has_explicit_chunked_prefill_req,
         )
         self.padded_active_token_count = self.padded_batch_dimensions.token_count
         self.padded_active_request_count = self.padded_batch_dimensions.req_count
@@ -1373,6 +1370,8 @@ def initialize_attention_state(

         attn_dimensions = batch_dimensions
         if self.using_cuda_graph_this_step():
+            assert not self.has_explicit_chunked_prefill_req
+
             # Treat some decode requests as prefill requests to fit the cuda graph batch dimension.
             if batch_dimensions.decode_req_count > self.padded_batch_dimensions.decode_req_count:
                 total_req = batch_dimensions.req_count
@@ -1382,7 +1381,7 @@ def initialize_attention_state(
                     token_count=batch_dimensions.token_count,
                     prefill_req_count=adjusted_prefill_req_count,
                     decode_req_count=adjusted_decode_req_count,
-                    has_explicit_chunked_prefill_req=has_explicit_chunked_prefill_req,
+                    has_explicit_chunked_prefill_req=False,
                 )

         self.active_attn_metadata["mha_metadata"].update(
@@ -1461,6 +1460,7 @@ def reset(self) -> None:

         # Reset chunked prefill state
         self.chunked_prefill_request_id = -1
+        self.has_explicit_chunked_prefill_req = False
         self.num_prefill_requests = 0
         self._using_cuda_graph_this_step = False
         self.padded_batch_dimensions = InferenceBatchDimensions(
@@ -1981,6 +1981,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
             active_requests_mask[-1] = (
                 1  # must keep this, next iteration will add a new chunk to it
             )
+            self.has_explicit_chunked_prefill_req = False

         active_request_count = (active_requests_mask == 1).sum().item()
         finished_request_count = (active_requests_mask == 0).sum().item()
@@ -2011,7 +2012,6 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T

             # Reset Mamba state.
             self.reset_mamba_state()
-
             return

         # 3. Concatenate the paused tokens to the active tokens if present.
@@ -2070,9 +2070,9 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T

         if self.chunked_prefill_request_id != -1:
             # find the id in request_ids that is the chunked_prefill_request_id. Only one request should be chunked.
-            active_requests_requiring_new_block[self.get_index_of_chunked_prefill_request()] = (
-                0  # chunked prefill should not be paused
-            )
+            active_requests_requiring_new_block[
+                self.get_index_of_chunked_prefill_request() - self.paused_request_count
+            ] = 0  # chunked prefill should not be paused

         active_requests_requiring_new_block_count = (
             (active_requests_requiring_new_block == 1).sum().item()
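The substantive fix in update_requests is the index shift: active_requests_requiring_new_block covers only the non-paused active requests, while get_index_of_chunked_prefill_request() appears to return a slot over the full request list, so the paused-request offset has to be subtracted before indexing. A toy illustration with made-up names and sizes (not the real data layout):

    import torch

    # Hypothetical layout: two paused requests followed by three active ones,
    # with the chunked-prefill request sitting at global slot 3.
    paused_request_count = 2
    chunked_prefill_global_index = 3

    # Mask covering only the active (non-paused) requests.
    active_requests_requiring_new_block = torch.tensor([1, 1, 1])

    # Indexing with the global slot would hit the wrong entry (or run past the
    # end); shifting by the paused count lands on the intended request.
    local_index = chunked_prefill_global_index - paused_request_count
    active_requests_requiring_new_block[local_index] = 0  # never pause the chunked request

    print(active_requests_requiring_new_block)  # tensor([1, 0, 1])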

megatron/core/inference/engines/dynamic_engine.py
Lines changed: 11 additions & 1 deletion

@@ -1081,6 +1081,12 @@ def schedule_chunked_prefill(self):

            if request_can_be_added and kv_cache_available:
                if token_fully_can_be_added:
+                    # For Mamba models we need to ensure that the last prefill chunk
+                    # is still tagged as a chunked prefill request.
+                    self.context.has_explicit_chunked_prefill_req = (
+                        self.context.is_hybrid_model
+                        and self.context.chunked_prefill_request_id == req.request_id
+                    )
                    self.context.chunked_prefill_request_id = -1
                    self.context.add_request(req)
                    self._loop.call_soon_threadsafe(
@@ -1091,14 +1097,18 @@
                    # Fully scheduled, so we remove from waiting pool
                    self.waiting_request_ids.popleft()
                    # Only this case we keep checking the rest of the waiting queue
-                    can_schedule = True
+                    # We break early for Mamba models running a final prefill chunk
+                    # so that no additional requests are scheduled beyond the chunked
+                    # prefill request.
+                    can_schedule = not self.context.has_explicit_chunked_prefill_req
                elif token_partially_can_be_added:
                    chunk_length = self.context.max_tokens - self.context.active_token_count
                    self.context.add_request(req, chunk_length=chunk_length)
                    self._loop.call_soon_threadsafe(
                        self._loop.create_task, self._notify_cond_for_new_request()
                    )
                    self.context.chunked_prefill_request_id = req.request_id
+                    self.context.has_explicit_chunked_prefill_req = self.context.is_hybrid_model
                    req.remaining_prompt_tokens = req.remaining_prompt_tokens[chunk_length:]
                    req.finished_chunk_token_count += chunk_length
                    # Still have tokens to prefill, so we break and keep the
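Taken together, the two hunks make the scheduler (a) tag the final chunk of a chunked-prefill request as an explicit chunked prefill when the model is a hybrid (Mamba) model, and (b) stop pulling more requests off the waiting queue for that step. A simplified sketch of that control flow, with hypothetical helpers (fits_entirely, fits_partially, remaining_token_budget) standing in for the real capacity checks:

    def schedule_waiting_requests(context, waiting_request_ids, requests):
        """Rough sketch of the chunked-prefill scheduling decisions; it does
        not mirror the real engine's loop or its asyncio notifications."""
        can_schedule = True
        while can_schedule and waiting_request_ids:
            req = requests[waiting_request_ids[0]]
            if req.fits_entirely(context):
                # Final chunk of a previously chunked request: for hybrid
                # (Mamba) models keep it tagged as an explicit chunked prefill.
                context.has_explicit_chunked_prefill_req = (
                    context.is_hybrid_model
                    and context.chunked_prefill_request_id == req.request_id
                )
                context.chunked_prefill_request_id = -1
                context.add_request(req)
                waiting_request_ids.popleft()
                # Do not schedule anything else alongside a final Mamba chunk.
                can_schedule = not context.has_explicit_chunked_prefill_req
            elif req.fits_partially(context):
                context.add_request(req, chunk_length=context.remaining_token_budget())
                context.chunked_prefill_request_id = req.request_id
                context.has_explicit_chunked_prefill_req = context.is_hybrid_model
                can_schedule = False  # this chunk used up the token budget
            else:
                can_schedule = False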

megatron/core/transformer/attention.py
Lines changed: 4 additions & 1 deletion

@@ -658,6 +658,7 @@ def flash_decode_and_prefill(
         cu_seqlens_k,
         seqlens_k,
         block_table,
+        is_decode_only,
     ) -> Tensor:
         """Flash attention kernel for mixed decode and prefill samples.

@@ -671,6 +672,7 @@ def flash_decode_and_prefill(
             cu_seqlens_k (Tensor): Cumulative key sequence lengths.
             seqlens_k (Tensor): key sequence lengths.
             block_table (Tensor): KV cache block ids for all samples.
+            is_decode_only (bool): True if batch is decode only.
         Return:
             (Tensor) Attention output.
         """
@@ -679,7 +681,7 @@
         assert block_table is not None

         # Flash attn kernel.
-        if max_seqlen_q > 1:
+        if not is_decode_only:
             q = q.squeeze(1)
             if getattr(self, "softmax_scale", None) is not None:
                 softmax_scale = self.softmax_scale
@@ -1065,6 +1067,7 @@ def forward(
                 cu_kv_lengths,
                 kv_lengths,
                 block_table,
+                inference_context.is_decode_only(),
             )
             core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)')
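The kernel-selection branch previously inferred "decode only" from max_seqlen_q > 1; the caller now passes inference_context.is_decode_only() explicitly, presumably so that a prefill chunk whose queries happen to be one token long is still routed down the prefill path. A toy dispatcher contrasting the two conditions (names are illustrative, not the real kernel selection):

    def choose_attention_path(is_decode_only: bool, max_seqlen_q: int) -> tuple:
        """Contrast the old length-based heuristic with the explicit flag."""
        # Old heuristic: any query longer than one token means "mixed batch".
        old_path = "varlen_prefill" if max_seqlen_q > 1 else "decode"
        # New behavior: trust the inference context's explicit flag.
        new_path = "decode" if is_decode_only else "varlen_prefill"
        return old_path, new_path

    # A batch that still contains a one-token prefill chunk:
    print(choose_attention_path(is_decode_only=False, max_seqlen_q=1))
    # ('decode', 'varlen_prefill')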

megatron/training/arguments.py
Lines changed: 6 additions & 2 deletions

@@ -1686,8 +1686,12 @@ def _add_inference_args(parser):
     group.add_argument('--mlp-chunks-for-prefill', type=int, default=1,
                        help='Number of chunks along sequence dimension for MLP '
                        'computation during prefill')
-    group.add_argument('--disable-chunked-prefill', default=False, action="store_true",
-                       help='Disable chunked prefill (chunked prefill is enabled by default).')
+    # TODO(ksanthanam): Clean this up in future PR
+    group.add_argument('--enable-chunked-prefill', dest='disable_chunked_prefill',
+                       action='store_false', default=True,
+                       help="Enable chunked prefill (disabled by default)")
+    group.add_argument('--disable-chunked-prefill', dest='disable_chunked_prefill',
+                       action='store_true', help=argparse.SUPPRESS)
     group.add_argument('--inference-dynamic-batching-cuda-graph-max-tokens',
                        type=int, default=16384,
                        help='Maximum number of tokens to capture in a cuda graph.')
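The new pair of flags shares a single destination: --enable-chunked-prefill stores False into disable_chunked_prefill, while the legacy --disable-chunked-prefill is kept for compatibility but hidden from --help via argparse.SUPPRESS. A standalone repro of that argparse pattern:

    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group('inference')
    group.add_argument('--enable-chunked-prefill', dest='disable_chunked_prefill',
                       action='store_false', default=True,
                       help="Enable chunked prefill (disabled by default)")
    group.add_argument('--disable-chunked-prefill', dest='disable_chunked_prefill',
                       action='store_true', help=argparse.SUPPRESS)

    print(parser.parse_args([]).disable_chunked_prefill)                             # True
    print(parser.parse_args(['--enable-chunked-prefill']).disable_chunked_prefill)   # False
    print(parser.parse_args(['--disable-chunked-prefill']).disable_chunked_prefill)  # True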
