Commit b04e7e6

[TRTLLM-9686][feat] Fix issues with processed logprobs functionality.
- Expand test_logits_logprobs to perform a check for processed logprobs
- Fix processed logprobs for greedy sampling and when using temperature

Signed-off-by: Stefan Niebler <82932102+stnie@users.noreply.github.com>

1 parent: 5db31a5

5 files changed: +207 −38 lines

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 24 additions & 16 deletions
@@ -1127,7 +1127,7 @@ def _convert_logprobs_tensor_to_list(
                 assert beam_idx == 0, (
                     "beam search does not need to explicitly handle sampled log probs"
                 )
-                if sampled_log_probs_indices[step_idx] not in logprobs:
+                if sampled_log_probs_indices[step_idx].item() not in logprobs:
                     logprobs[sampled_log_probs_indices[step_idx].item()] = Logprob(
                         logprob=sampled_log_probs_vals[step_idx].item(),
                         rank=max(
@@ -1380,7 +1380,7 @@ def _process_draft_tokens_rejection_sampling(
             else _request_strategy(request, vocab_size=2**31)
         )
         generator = self.get_generator(request.py_draft_logits.device)
-        _, draft_probs = sample(
+        _, draft_probs, _ = sample(
             draft_sampling_strategy,
             request.py_draft_logits,
             generator=generator,
@@ -2160,7 +2160,7 @@ def _sample_batched_by_strategy(
                 for _ in range(steps)
             ]

-            group_next_tokens_cuda, group_softmax_cuda = (
+            group_next_tokens_cuda, group_softmax_cuda, group_temperature_cuda = (
                 self._grouped_sampler_cls.sample_grouped_strategies(
                     strategy_key,
                     group_strategies_per_step,
@@ -2182,18 +2182,29 @@
             ].copy_(group_next_tokens_cuda, non_blocking=True)

             if return_log_probs:
+                # select the logits for the current group
+                current_group_logits_cuda = (
+                    group_logits_cuda
+                    if logit_indices_for_sampler is None
+                    else group_logits_cuda[logit_indices_for_sampler]
+                )
                 if need_processed_logprobs:
                     # if softmax is 0, then the logit was masked out => set to -inf
-                    group_tgt_logits_cuda = torch.where(
-                        group_softmax_cuda != 0, group_logits_cuda, float("-inf")
-                    )
+                    # apply masking to the logits and store in batch_logits_cuda
                     batch_logits_cuda[
                         batch_next_tokens_offset_start:batch_next_tokens_offset_end
-                    ].copy_(group_tgt_logits_cuda, non_blocking=True)
+                    ] = torch.where(
+                        group_softmax_cuda > 0, current_group_logits_cuda, float("-inf")
+                    )
+                    # apply temperature to the logits
+                    if group_temperature_cuda is not None:
+                        batch_logits_cuda[
+                            batch_next_tokens_offset_start:batch_next_tokens_offset_end
+                        ] /= group_temperature_cuda
                 else:
                     batch_logits_cuda[
                         batch_next_tokens_offset_start:batch_next_tokens_offset_end
-                    ].copy_(group_logits_cuda, non_blocking=True)
+                    ].copy_(current_group_logits_cuda, non_blocking=True)

             # Set LlmRequest.py_target_probs
             if speculation_needs_probs:
@@ -2697,7 +2708,7 @@ def _process_logprobs(
         # NB: we do not need group logprobs anymore, we can reuse the storage
         # We only provide 0 based rank, it will be corrected to 1-indexed in handle logprobs
         group_logprobs_cuda.greater_(sampled_vals_cuda)
-        sampled_rank_cuda = group_logprobs_cuda.sum(dim=-1)
+        sampled_rank_cuda = group_logprobs_cuda.sum(dim=-1).to(torch.int32)

         # Use a single D2H copy to reduce overheads
         topk_vals = torch.empty_like(topk_vals_cuda, device="cpu", pin_memory=False)
@@ -2768,12 +2779,7 @@ def _process_requests(
             req_offsets=sampling_requests_metadata.req_offsets,
         )

-        self._handle_log_probs(
-            requests,
-            logits_cuda,
-            logits_cuda_indexer=logits_cuda_indexer,
-            req_num_generated_tokens=sampling_requests_metadata.req_num_generated_tokens,
-        )
+        return_log_probs = self._return_log_probs(requests)

         # Perform sampling in batches
         batched_sampling_result = self._sample_batched_by_strategy(
@@ -2792,7 +2798,9 @@
         )

         if return_log_probs:
-            self._process_logprobs(batched_sampling_result, requests, req_num_steps)
+            self._process_logprobs(
+                batched_sampling_result, requests, sampling_requests_metadata.req_num_steps
+            )

         # Fill results into output buffers
         new_tokens_host = self._unbatch_sampling_results(
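For illustration, here is a minimal standalone sketch of what the reworked `need_processed_logprobs` branch computes; the tensor names and shapes are stand-ins for the module's actual buffers, not the real code path. Entries whose post-sampling probability is zero are treated as masked out and forced to `-inf`, and the group's temperature is applied to the surviving logits before logprobs are derived from them:

```python
import torch

# Illustrative only: 4 requests in the group, vocab of 8.
group_logits_cuda = torch.randn(4, 8)
group_softmax_cuda = torch.softmax(group_logits_cuda, dim=-1)
group_softmax_cuda[:, 5:] = 0.0  # pretend these entries were filtered by top-k/top-p
group_temperature_cuda = torch.full((4, 1), 0.7)

# Probability 0 means the logit was masked out => set it to -inf.
processed_logits = torch.where(
    group_softmax_cuda > 0, group_logits_cuda, float("-inf")
)
# Apply the same temperature the sampler used.
processed_logits /= group_temperature_cuda

processed_logprobs = torch.log_softmax(processed_logits, dim=-1)
assert torch.isinf(processed_logprobs[:, 5:]).all()  # masked tokens get logprob -inf
```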

tensorrt_llm/_torch/pyexecutor/sampling_utils.py

Lines changed: 5 additions & 3 deletions
@@ -266,7 +266,8 @@ def greedy_search_sampling_batch(
     next_tokens = torch.argmax(logits, dim=-1)
     softmax: Optional[torch.Tensor] = None
     if return_probs:
-        softmax = torch.softmax(logits, dim=-1)
+        softmax = torch.zeros_like(logits)
+        softmax.scatter_(1, next_tokens.unsqueeze(-1), 1.0)
     return next_tokens, softmax


@@ -474,7 +475,7 @@ def sample(
     generator: Optional[torch.Generator] = None,
     group_metadata: StrategyMetadata | None = None,
     return_probs: bool = True,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[float]]:
     match strategy:
         case ("top_k", top_k, temperature):
             tokens, softmax = top_k_sampling_batch(
@@ -506,6 +507,7 @@
             )
         case ("greedy", None):
             tokens, softmax = greedy_search_sampling_batch(logits, return_probs=return_probs)
+            temperature = None
         case ("beam_search", beam_width_in, beam_width_out, temperature):
             assert group_metadata is not None and isinstance(group_metadata, BeamSearchMetadata), (
                 "BeamSearchMetadata is required for beam_search_sampling_batch"
@@ -519,7 +521,7 @@
             generator=generator,
             return_probs=return_probs,
         )
-    return tokens, softmax
+    return tokens, softmax, temperature


 GenericStrategyKeyType = TypeVar("GenericStrategyKeyType")
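A small sketch of the new greedy behavior (values are made up for illustration): instead of returning a full softmax, `greedy_search_sampling_batch` now reports a one-hot distribution on the argmax token, built with `scatter_`:

```python
import torch

# Hypothetical batch of 3 rows over a vocab of 5.
logits = torch.tensor([[2.0, 0.5, 1.0, -1.0, 0.0],
                       [0.1, 3.0, 0.2, 0.3, 0.4],
                       [1.0, 1.0, 4.0, 0.0, 2.0]])

next_tokens = torch.argmax(logits, dim=-1)

# One-hot "probabilities" on the greedily selected token.
softmax = torch.zeros_like(logits)
softmax.scatter_(1, next_tokens.unsqueeze(-1), 1.0)

print(next_tokens)  # tensor([0, 1, 2])
print(softmax[0])   # tensor([1., 0., 0., 0., 0.])
```

Because greedy decoding selects the argmax deterministically, the one-hot form lets the sampler-side masking above treat every non-selected token as filtered out, rather than deriving processed logprobs from a full softmax as the old `torch.softmax` call did.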

tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py

Lines changed: 18 additions & 4 deletions
@@ -141,8 +141,9 @@ def _sample_greedy_with_probs(
         *,
         group_logit_indices: Optional[torch.Tensor],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        probs = self._prepare_probs_with_temperature(logits, group_logit_indices, None)
-        new_tokens, _ = greedy_search_sampling_batch(probs, return_probs=False)
+        if group_logit_indices is not None:
+            logits = torch.index_select(logits, 0, group_logit_indices)  # ensures copy
+        new_tokens, probs = greedy_search_sampling_batch(logits, return_probs=True)
         return new_tokens, probs

     @classmethod
@@ -240,6 +241,9 @@ def computes_probs(cls) -> bool:
         return True

 class GreedyWithProbs(StrategyImplWithProbs):
+    def __init__(self):
+        self._temperature = None
+
     @override
     @classmethod
     def from_strategies(
@@ -425,6 +429,9 @@ def computes_probs(cls) -> bool:
         return False

 class GreedySampleOnly(StrategyImplSampleOnly):
+    def __init__(self):
+        self._temperature = None
+
     @override
     @classmethod
     def from_strategies(
@@ -722,7 +729,7 @@ def sample_grouped_strategies(
         generator: Optional[torch.Generator] = None,
         return_probs: bool,
         group_metadata: StrategyMetadata | None = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         if hasattr(group_key, "static_beam_width_in"):
             beam_width_in = group_key.static_beam_width_in
         else:
@@ -735,9 +742,16 @@
         assert return_probs == group_key.computes_probs()

         strategy_impl_cls = group_key
-        return strategy_impl_cls.from_strategies(strategies, cuda_device=logits.device).sample(
+        sampling_object = strategy_impl_cls.from_strategies(strategies, cuda_device=logits.device)
+        next_tokens, softmax = sampling_object.sample(
             logits,
             group_logit_indices=group_logit_indices,
             generator=generator,
             group_metadata=group_metadata,
         )
+        temperature = (
+            sampling_object._temperature.unsqueeze(-1)
+            if sampling_object._temperature is not None
+            else None
+        )
+        return next_tokens, softmax, temperature
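The trailing `unsqueeze(-1)` on the returned temperature matters for the division in sampler.py: it turns a per-request vector into a column so it broadcasts across the vocab dimension. A minimal sketch (shapes and values are illustrative, not the actual flashinfer sampler state):

```python
import torch

# Hypothetical per-request temperatures for a group of 3 requests.
temperature = torch.tensor([0.7, 1.0, 1.3])
logits = torch.randn(3, 32000)

# (3,) -> (3, 1) so the division broadcasts over the vocab dimension.
scaled = logits / temperature.unsqueeze(-1)
assert scaled.shape == logits.shape
```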

tensorrt_llm/sampling_params.py

Lines changed: 3 additions & 1 deletion
@@ -337,7 +337,9 @@ def _validate(self):
         if self.guided_decoding is not None:
             self.guided_decoding._validate()

-        # correct types as users might pass in logprob=True for Top-1 logprobs
+        # correct types as users might pass in logprob=True for Top-1 logprobs and logprobs=False for no logprobs
+        if self.logprobs is False:
+            self.logprobs = None
         self.logprobs = self.logprobs and int(self.logprobs)
         self.prompt_logprobs = self.prompt_logprobs and int(self.prompt_logprobs)
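The normalization this `_validate()` change performs can be summarized in a standalone helper; `normalize_logprobs` below is a hypothetical function written only to illustrate the behavior, not part of SamplingParams:

```python
from typing import Optional, Union

def normalize_logprobs(logprobs: Union[bool, int, None]) -> Optional[int]:
    # Users may pass logprobs=True for Top-1 logprobs or logprobs=False for none.
    if logprobs is False:
        return None
    return logprobs and int(logprobs)

assert normalize_logprobs(True) == 1    # True means "give me top-1 logprobs"
assert normalize_logprobs(False) is None  # False now means "no logprobs" instead of 0
assert normalize_logprobs(None) is None
assert normalize_logprobs(5) == 5
```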
