Skip to content

Commit be86bed

Browse files
committed
Store logit output in static tensor
1 parent 6a22702 commit be86bed

File tree

1 file changed

+54
-30
lines changed

1 file changed

+54
-30
lines changed

megatron/core/inference/text_generation_controllers/text_generation_controller.py

Lines changed: 54 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ def _init_dynamic_sampling_tensors(self):
109109
"""Initialize tensors needed for dynamic sampling."""
110110
context = self.inference_wrapped_model.inference_context
111111
max_requests = context.max_requests
112+
if context.materialize_only_last_token_logits:
113+
max_logits = max_requests
114+
else:
115+
max_logits = context.max_tokens
112116

113117
# Callback to get request IDs that should be marked as finished due to stop words
114118
self._get_stop_word_finished_ids_callback = None
@@ -117,6 +121,15 @@ def _init_dynamic_sampling_tensors(self):
117121
logits_dtype = self.inference_wrapped_model.config.params_dtype
118122

119123
self._sampling_backend = "torch"
124+
self._enable_cuda_graph = False
125+
126+
# Initialize bookkeeping tensors.
127+
if self._enable_cuda_graph:
128+
self._all_logits_cuda = torch.empty(
129+
(1, max_logits, self.vocab_size), dtype=logits_dtype, device=device
130+
)
131+
else:
132+
self._all_logits_cuda = None
120133
self._sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device)
121134
# Speculative tokens tensor will be allocated later when num_speculative_tokens is set by the engine
122135
self._accepted_tokens_per_request = None
@@ -596,7 +609,7 @@ def _dynamic_step_context_init(
596609
else:
597610
return context.current_input_and_position_ids()
598611

599-
def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) -> Tensor:
612+
def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor):
600613
"""Forward step the model to get logits for dynamic batching.
601614
602615
This also handles logits-broadcasting for pipeline parallelism.
@@ -607,6 +620,11 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor)
607620
"""
608621
context = self.inference_wrapped_model.inference_context
609622
active_request_count = context.total_request_count - context.paused_request_count
623+
logits_seq_len = (
624+
active_request_count
625+
if context.materialize_only_last_token_logits
626+
else context.padded_active_token_count
627+
)
610628

611629
with torch.inference_mode():
612630
logits = self.inference_wrapped_model.run_one_forward_step(
@@ -619,6 +637,12 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor)
619637
# will be computed serially after verification to ensure they are
620638
# conditioned on verified tokens only.
621639

640+
assert logits_seq_len == (
641+
active_request_count
642+
if context.materialize_only_last_token_logits
643+
else input_ids.shape[1]
644+
)
645+
622646
if self.model_is_pipeline_parallel:
623647
if context.config.materialize_only_last_token_logits:
624648
logits_seq_len = active_request_count
@@ -636,7 +660,11 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor)
636660
pp_group=self.pp_group,
637661
)
638662

639-
return logits
663+
# Copy logits to contiguous buffer.
664+
if self._enable_cuda_graph:
665+
self._all_logits_cuda[:, :logits_seq_len, :].copy_(logits)
666+
else:
667+
self._all_logits_cuda = logits
640668

641669
def _dynamic_step_sample_bookkeeping(self):
642670
"""Perform bookkeeping necessary to sample logits for dynamic batching."""
@@ -1053,7 +1081,7 @@ def _verify_speculative_tokens(
10531081

10541082
return last_one_indices, accepted_tokens_mask, input_tokens_required
10551083

1056-
def _dynamic_step_sample_logits_and_verify_tokens(self, logits: Tensor, input_ids: Tensor):
1084+
def _dynamic_step_sample_logits_and_verify_tokens(self, input_ids: Tensor):
10571085
"""
10581086
Sample tokens from logits for dynamic batching with speculative tokens and verify the tokens.
10591087
"""
@@ -1069,6 +1097,7 @@ def _dynamic_step_sample_logits_and_verify_tokens(self, logits: Tensor, input_id
10691097
num_decode_requests = active_request_count - num_prefill_requests
10701098

10711099
# Get the logit indices for tokens that need sampling.
1100+
logits = self._all_logits_cuda
10721101
required_logit_indices = self._get_required_logit_indices(
10731102
request_in_prefill_status_tensor,
10741103
request_query_lengths,
@@ -1132,24 +1161,22 @@ def _dynamic_step_sample_logits_and_verify_tokens(self, logits: Tensor, input_id
11321161
dim=1
11331162
)
11341163

1135-
def _dynamic_step_sample_logits(self, logits: Tensor):
1136-
"""Sample tokens from logits for dynamic batching.
1137-
1138-
Args:
1139-
logits (Tensor): The logits from the forward pass.
1140-
"""
1164+
def _dynamic_step_sample_logits(self):
1165+
"""Sample tokens from logits for dynamic batching."""
11411166
# TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank
11421167
# and then broadcast the sampled tokens rather than broadcasting the raw logits.
11431168

11441169
# Last token logits.
11451170
context = self.inference_wrapped_model.inference_context
1171+
active_request_count = context.total_request_count - context.paused_request_count
1172+
11461173
if context.config.materialize_only_last_token_logits:
11471174
# When materialize_only_last_token_logits is true, last_token_logits is
11481175
# already called in the forward pass of GPT.
1149-
required_token_logits = logits.squeeze(0)
1176+
required_token_logits = self._all_logits_cuda.squeeze(0)[:active_request_count, :]
11501177
else:
11511178
# TODO: Should do verification here and get appropriate last token logits
1152-
required_token_logits = context.last_token_logits(logits)
1179+
required_token_logits = context.last_token_logits(self._all_logits_cuda)
11531180

11541181
if self._sampling_backend == "torch":
11551182
# Concatenate the outputs once to prevent repeated small writes.
@@ -1247,19 +1274,24 @@ def _router_record_bookkeeping(self) -> Optional[Dict[int, Tensor]]:
12471274

12481275
return routing_indices_per_request
12491276

1250-
def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optional[Tensor]:
1277+
def _dynamic_step_calculate_log_probs(self) -> Optional[Tensor]:
12511278
"""Calculate log probs from logits."""
12521279
context = self.inference_wrapped_model.inference_context
12531280
active_request_count = context.total_request_count - context.paused_request_count
1281+
logits_seq_len = (
1282+
active_request_count
1283+
if context.materialize_only_last_token_logits
1284+
else context.padded_active_token_count
1285+
)
12541286

12551287
return context.calculate_log_probs(
1256-
logits,
1288+
self._all_logits_cuda[:, :logits_seq_len, :],
12571289
self._sampled_tokens_cuda[:active_request_count],
12581290
only_last_token_logits=context.config.materialize_only_last_token_logits,
12591291
)
12601292

12611293
def _dynamic_step_calculate_log_probs_speculative(
1262-
self, logits: Tensor
1294+
self,
12631295
) -> Tuple[List[List[float]], Tensor]:
12641296
"""Calculate log probs from logits for speculative decoding.
12651297
@@ -1271,9 +1303,6 @@ def _dynamic_step_calculate_log_probs_speculative(
12711303
- log_prob(accepted_token[j]) comes from logits at position j
12721304
- log_prob(newly_sampled_token) comes from logits at position accepted_count
12731305
1274-
Args:
1275-
logits (Tensor): The main model logits [1, seq_len, vocab_size].
1276-
12771306
Returns:
12781307
Tuple of (log_probs_list, log_probs_tensor):
12791308
log_probs_list: List of lists, one per active request, containing
@@ -1291,7 +1320,7 @@ def _dynamic_step_calculate_log_probs_speculative(
12911320
num_prefill_requests = request_in_prefill_status_tensor.sum().item()
12921321
num_decode_requests = active_request_count - num_prefill_requests
12931322

1294-
logits_squeezed = logits.squeeze(0).float()
1323+
logits_squeezed = self._all_logits_cuda.squeeze(0).float()
12951324
log_probs_tensor = F.log_softmax(logits_squeezed[: context.active_token_count], dim=-1)
12961325

12971326
log_probs_list_decode = []
@@ -1449,12 +1478,11 @@ def _dynamic_step_calculate_top_n_logprobs_speculative(
14491478
return top_n_results if top_n_results else None
14501479

14511480
def _dynamic_step_calculate_top_n_logprobs(
1452-
self, logits: Tensor, log_probs_tensor: Optional[Tensor] = None
1481+
self, log_probs_tensor: Optional[Tensor] = None
14531482
) -> Optional[Dict[int, List[Tuple[Tensor, Tensor]]]]:
14541483
"""Calculate top-n log probs from logits for dynamic batching.
14551484
14561485
Args:
1457-
logits (Tensor): The logits to compute top-n log probs from.
14581486
log_probs_tensor (Optional[Tensor]): Pre-computed log probabilities tensor.
14591487
If provided, avoids recomputing log_softmax. Should be the tensor
14601488
returned by calculate_log_probs.
@@ -1743,7 +1771,7 @@ async def async_generate_output_tokens_dynamic_batch(
17431771

17441772
# Forward pass produces only base logits. When speculative decoding is
17451773
# active, MTP logits are computed serially after verification.
1746-
logits = self._dynamic_step_forward_logits(input_ids, position_ids)
1774+
self._dynamic_step_forward_logits(input_ids, position_ids)
17471775

17481776
# Commit Mamba intermediate states before update_requests, which
17491777
# may swap request indices. The Python lists tracking EOS block IDs
@@ -1769,7 +1797,7 @@ async def async_generate_output_tokens_dynamic_batch(
17691797

17701798
if self.num_speculative_tokens > 0:
17711799
# Phase 1: Verify speculative tokens using base logits only.
1772-
self._dynamic_step_sample_logits_and_verify_tokens(logits, input_ids)
1800+
self._dynamic_step_sample_logits_and_verify_tokens(input_ids)
17731801
# Phase 2: Rewind KV cache for rejected tokens.
17741802
self._rewind_kv_cache()
17751803

@@ -1781,25 +1809,21 @@ async def async_generate_output_tokens_dynamic_batch(
17811809
# Phase 3: Compute MTP serially with correct (verified) inputs.
17821810
self._compute_serial_mtp_and_sample()
17831811
else:
1784-
self._dynamic_step_sample_logits(logits)
1812+
self._dynamic_step_sample_logits()
17851813

17861814
log_probs = None
17871815
top_n_logprobs = None
17881816
if return_log_probs or return_top_n_logprobs:
17891817
if self.num_speculative_tokens > 0:
1790-
log_probs, log_probs_tensor = self._dynamic_step_calculate_log_probs_speculative(
1791-
logits
1792-
)
1818+
log_probs, log_probs_tensor = self._dynamic_step_calculate_log_probs_speculative()
17931819
if return_top_n_logprobs:
17941820
top_n_logprobs = self._dynamic_step_calculate_top_n_logprobs_speculative(
17951821
log_probs_tensor
17961822
)
17971823
else:
1798-
log_probs, log_probs_tensor = self._dynamic_step_calculate_log_probs(logits)
1824+
log_probs, log_probs_tensor = self._dynamic_step_calculate_log_probs()
17991825
if return_top_n_logprobs:
1800-
top_n_logprobs = self._dynamic_step_calculate_top_n_logprobs(
1801-
logits, log_probs_tensor
1802-
)
1826+
top_n_logprobs = self._dynamic_step_calculate_top_n_logprobs(log_probs_tensor)
18031827

18041828
if skip_bookkeeping:
18051829
request_bookkeeping = {}

0 commit comments

Comments
 (0)