@@ -202,12 +202,13 @@ class SampleStateWithMMResult:
 class RequestGroupKey(Generic[GenericStrategyKeyType]):
     strategy: GenericStrategyKeyType
     speculation_needs_probs: bool
+    need_processed_logprobs: bool
 
     def __iter__(self):
-        return iter((self.strategy, self.speculation_needs_probs))
+        return iter((self.strategy, self.speculation_needs_probs, self.need_processed_logprobs))
 
     def __len__(self):
-        return 2
+        return 3
 
 
 class RequestGroupValue(NamedTuple):
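A minimal sketch of why `__iter__` and `__len__` must track the new field: the group key is unpacked like a tuple in the sampling loop, so all three pieces have to come back out in order. The `_GroupKeySketch` class below is a simplified, hypothetical stand-in for `RequestGroupKey`, not the real class.

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class _GroupKeySketch:
    strategy: str
    speculation_needs_probs: bool
    need_processed_logprobs: bool

    def __iter__(self):
        # Tuple-style unpacking relies on this ordering.
        return iter((self.strategy, self.speculation_needs_probs, self.need_processed_logprobs))

    def __len__(self):
        return 3

key = _GroupKeySketch("top_p", False, True)
strategy, needs_probs, needs_processed = key  # unpacks via __iter__
assert len(key) == 3
```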
@@ -338,13 +339,19 @@ def _group_requests_by_strategy_key(
             # process_draft_tokens.
             TorchSampler._speculation_could_use_rejection_sampling(req, strategy)
         )
-        strategy_key = strategy_to_key(strategy, speculation_needs_probs)
-        group_dict_entry = group_dict[(strategy_key, speculation_needs_probs)]
+        need_processed_logprobs = req.py_logprobs_mode == "processed"
+        need_probs = speculation_needs_probs or need_processed_logprobs
+        strategy_key = strategy_to_key(strategy, need_probs)
+        group_dict_entry = group_dict[
+            (strategy_key, speculation_needs_probs, need_processed_logprobs)
+        ]
         group_dict_entry[0].append(req_index)
         group_dict_entry[1].append(strategy)
     return {
         RequestGroupKey(
-            strategy=group_key[0], speculation_needs_probs=group_key[1]
+            strategy=group_key[0],
+            speculation_needs_probs=group_key[1],
+            need_processed_logprobs=group_key[2],
         ): RequestGroupValue(
             indices=torch.tensor(indices, pin_memory=pin_memory, dtype=torch.int32),
             strategies=strategies,
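A rough, self-contained sketch of the grouping above, under the assumption that requests expose `py_draft_logits` and `py_logprobs_mode` as in the diff; the strategy and `strategy_to_key` are stubbed here, so this only illustrates how the combined `need_probs` flag and the three-part dictionary key interact.

```python
from collections import defaultdict

def group_requests_sketch(requests):
    """Illustrative only: bucket request indices by (strategy key, needs_probs, needs_processed)."""
    group_dict: dict[tuple, list[int]] = defaultdict(list)
    for req_index, req in enumerate(requests):
        speculation_needs_probs = getattr(req, "py_draft_logits", None) is not None
        need_processed_logprobs = getattr(req, "py_logprobs_mode", None) == "processed"
        # The sampler must return softmax probabilities if either consumer needs them,
        # so the strategy key is derived from the combined flag ...
        need_probs = speculation_needs_probs or need_processed_logprobs
        strategy_key = ("greedy", need_probs)  # stand-in for strategy_to_key(strategy, need_probs)
        # ... while the group key keeps both flags separate, because they trigger different
        # post-processing (rejection-sampling probabilities vs. processed logprobs).
        group_dict[(strategy_key, speculation_needs_probs, need_processed_logprobs)].append(req_index)
    return dict(group_dict)
```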
@@ -374,6 +381,8 @@ class _BatchedSamplingResult:
     batch_req_indices: torch.Tensor
     # Next tokens for all requests:
     batch_next_tokens_cuda_int: torch.Tensor
+    # Logits for all requests (populated only when logprobs are requested):
+    batch_logits_cuda: torch.Tensor | None = None
 
 
 # Helper class for _PackedStepIndexer and _UnpackedStepIndexer, facilitating the
@@ -942,34 +951,55 @@ def _convert_logprobs_tensor_to_list(
         self,
         token_tensor: torch.Tensor,
         logprobs_tensor: torch.Tensor,
+        sampled_log_probs_indices: torch.Tensor | None,
+        sampled_log_probs_vals: torch.Tensor | None,
+        sampled_log_probs_rank: torch.Tensor | None,
     ) -> list[list[dict[int, Logprob]]]:
         """Convert the logprobs tensor to a list of lists of dictionaries of Logprob objects
 
         Logprobs storage expects logprobs as a list[list[dict[int, Logprob]]] object
 
         args:
+            token_tensor: torch.Tensor. Shape: beam_width, num_tokens, num_logprobs
             logprobs_tensor: torch.Tensor. Shape: beam_width, num_tokens, num_logprobs
+            sampled_log_probs_indices: torch.Tensor | None. Shape: num_tokens
+            sampled_log_probs_vals: torch.Tensor | None. Shape: num_tokens
+            sampled_log_probs_rank: torch.Tensor | None. Shape: num_tokens
         output:
             list[list[dict[int, Logprob]]]. Shape: beam_width, num_tokens, dict with num_logprobs keys
         """
         assert token_tensor.dim() == 3 and logprobs_tensor.dim() == 3, (
             f"Token and logprobs tensors must have 3 dimensions (beam_width, num_tokens, num_logprobs). \
             Got shapes (token_tensor) {token_tensor.shape} and (logprobs_tensor) {logprobs_tensor.shape} instead"
         )
-        return [
-            [
-                {
+
+        token_log_probs: list[list[dict[int, Logprob]]] = []
+        for beam_idx in range(token_tensor.shape[0]):
+            beam_token_log_probs: list[dict[int, Logprob]] = []
+            for step_idx, (topk_token, topk_logprob) in enumerate(
+                zip(token_tensor[beam_idx], logprobs_tensor[beam_idx])
+            ):
+                logprobs = {
                     token: Logprob(logprob=logprob, rank=rank + 1)
                     for rank, (token, logprob) in enumerate(
                         zip(topk_token.tolist(), topk_logprob.tolist())
                     )
                 }
-                for topk_token, topk_logprob in zip(
-                    token_tensor[beam_idx], logprobs_tensor[beam_idx]
-                )
-            ]
-            for beam_idx in range(token_tensor.shape[0])
-        ]
+                if sampled_log_probs_indices is not None:
+                    assert beam_idx == 0, (
+                        "beam search does not need to explicitly handle sampled log probs"
+                    )
+                    if sampled_log_probs_indices[step_idx] not in logprobs:
+                        logprobs[sampled_log_probs_indices[step_idx].item()] = Logprob(
+                            logprob=sampled_log_probs_vals[step_idx].item(),
+                            rank=max(
+                                token_tensor.shape[2] + 1, sampled_log_probs_rank[step_idx].item()
+                            ),
+                        )
+                beam_token_log_probs.append(logprobs)
+            token_log_probs.append(beam_token_log_probs)
+
+        return token_log_probs
 
     def handle_logprobs(
         self,
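The per-step merge performed by the rewritten loop can be shown in isolation. The sketch below stubs `Logprob` with a namedtuple (the real class comes from the result-handling code) and shows how a sampled token that fell outside the requested top-k is appended with a rank of at least `k + 1`.

```python
from collections import namedtuple

Logprob = namedtuple("Logprob", ["logprob", "rank"])  # stub for illustration only

def merge_sampled_token(topk_tokens, topk_logprobs, sampled_token, sampled_logprob, sampled_rank_1idx):
    """One decoding step: build the top-k dict, then add the sampled token if it is not in the top-k."""
    step = {
        tok: Logprob(logprob=lp, rank=rank + 1)
        for rank, (tok, lp) in enumerate(zip(topk_tokens, topk_logprobs))
    }
    if sampled_token not in step:
        # The sampled token is outside the top-k, so its rank is at least k + 1.
        step[sampled_token] = Logprob(
            logprob=sampled_logprob, rank=max(len(topk_tokens) + 1, sampled_rank_1idx)
        )
    return step

# Example: top-2 logprobs requested, but a lower-ranked token (id 42) was actually sampled.
print(merge_sampled_token([7, 3], [-0.2, -1.9], 42, -3.4, 5))
```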
@@ -986,6 +1016,10 @@ def handle_logprobs(
             topk_log_probs_indices = self.store.new_tokens[0, request.py_seq_slot].view(
                 beam_width, count, -1
             )
+            sampled_log_probs_vals = None
+            sampled_log_probs_indices = None
+            # the beam-search path does not provide separate sampled-token logprobs
+            sampled_log_probs_rank = None
         else:
             assert beam_width == 1, "beam width must be 1 for non-beam search"
             topk_log_probs_vals = request.py_topk_logprobs_vals[: count * beam_width].view(
@@ -994,9 +1028,17 @@ def handle_logprobs(
             topk_log_probs_indices = request.py_topk_logprobs_indices[
                 : count * beam_width
             ].view(beam_width, count, -1)
+            sampled_log_probs_vals = request.py_sampled_logprobs_vals[:count]
+            sampled_log_probs_indices = request.py_sampled_logprobs_indices[:count]
+            # correct the rank to be 1-indexed
+            sampled_log_probs_rank = request.py_sampled_logprobs_rank[:count] + 1
 
         token_log_probs = self._convert_logprobs_tensor_to_list(
-            topk_log_probs_indices, topk_log_probs_vals
+            topk_log_probs_indices,
+            topk_log_probs_vals,
+            sampled_log_probs_indices,
+            sampled_log_probs_vals,
+            sampled_log_probs_rank,
         )
         request.py_result.append_log_probs(token_log_probs)
 
@@ -1865,6 +1907,7 @@ def _sample_batched_by_strategy(
         seq_slots: torch.Tensor,
         seq_lens: Optional[torch.Tensor] = None,
         token_dtype: torch.dtype,
+        return_log_probs: bool,
     ) -> _BatchedSamplingResult:
         grouped_requests = _group_requests_by_strategy_key(
             requests,
@@ -1894,9 +1937,16 @@ def _sample_batched_by_strategy(
         batch_next_tokens_cuda_int = torch.empty(
             (logits_cuda.size(0), self.max_beam_width), device=cuda_device, dtype=token_dtype
         )
+        batch_logits_cuda = (
+            torch.empty(
+                (logits_cuda.size(0), logits_cuda.size(1)), device=cuda_device, dtype=torch.float32
+            )
+            if return_log_probs
+            else None
+        )
         batch_req_idx_offset_start = 0
         batch_next_tokens_offset_start = 0
-        for (strategy_key, speculation_needs_probs), (
+        for (strategy_key, speculation_needs_probs, need_processed_logprobs), (
             group_req_indices,
             group_strategies,
             group_metadata,
@@ -1943,7 +1993,7 @@ def _sample_batched_by_strategy(
                 group_strategies_per_step,
                 group_logits_cuda,
                 generator=generator_cuda,
-                return_probs=speculation_needs_probs,
+                return_probs=speculation_needs_probs or need_processed_logprobs,
                 group_logit_indices=logit_indices_for_sampler,
                 group_metadata=group_metadata,
             )
@@ -1958,6 +2008,20 @@ def _sample_batched_by_strategy(
                 batch_next_tokens_offset_start:batch_next_tokens_offset_end
             ].copy_(group_next_tokens_cuda, non_blocking=True)
 
+            if return_log_probs:
+                if need_processed_logprobs:
+                    # if softmax is 0, then the logit was masked out => set to -inf
+                    group_tgt_logits_cuda = torch.where(
+                        group_softmax_cuda != 0, group_logits_cuda, float("-inf")
+                    )
+                    batch_logits_cuda[
+                        batch_next_tokens_offset_start:batch_next_tokens_offset_end
+                    ].copy_(group_tgt_logits_cuda, non_blocking=True)
+                else:
+                    batch_logits_cuda[
+                        batch_next_tokens_offset_start:batch_next_tokens_offset_end
+                    ].copy_(group_logits_cuda, non_blocking=True)
+
             # Set LlmRequest.py_target_probs
             if speculation_needs_probs:
                 assert group_softmax_cuda is not None
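The "processed" branch above recovers post-masking logits from the sampler's softmax output: any vocabulary entry whose probability is exactly zero was masked by top-k/top-p (or similar), so its logit is forced to -inf and the later `log_softmax` renormalizes over the surviving tokens only. A standalone sketch of that idea with made-up tensors:

```python
import torch

raw_logits = torch.tensor([[2.0, 1.0, -1.0, -3.0]])
probs = torch.tensor([[0.7311, 0.2689, 0.0, 0.0]])  # e.g. softmax after top-k=2 masking

# Zero probability => the token was masked out; pin its logit to -inf so the
# subsequent log_softmax yields -inf there and renormalizes over the kept tokens.
processed_logits = torch.where(probs != 0, raw_logits, float("-inf"))
processed_logprobs = torch.log_softmax(processed_logits, dim=-1)
print(processed_logprobs)  # finite for the two kept tokens, -inf for the masked ones
```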
@@ -1986,6 +2050,7 @@ def _sample_batched_by_strategy(
         return _BatchedSamplingResult(
             batch_req_indices=batch_req_indices,
             batch_next_tokens_cuda_int=batch_next_tokens_cuda_int,
+            batch_logits_cuda=batch_logits_cuda,
         )
 
     def _unbatch_sampling_results(
@@ -2385,6 +2450,63 @@ def request_stop_words(request: LlmRequest, new_tokens: torch.Tensor):
                     per_step[step, request_idx, beam_idx] = True
         return per_step
 
+    @nvtx_range("_process_logprobs")
+    def _process_logprobs(
+        self,
+        batched_sampling_result: _BatchedSamplingResult,
+        requests: list[LlmRequest],
+        req_num_steps: torch.Tensor,
+    ):
+        group_logprobs_cuda = F.log_softmax(batched_sampling_result.batch_logits_cuda, dim=-1)
+        all_req_indices = batched_sampling_result.batch_req_indices
+        group_next_tokens_cuda = batched_sampling_result.batch_next_tokens_cuda_int
+        group_req_indices = [
+            req_gid.item()
+            for req_gid in all_req_indices
+            if requests[req_gid].py_num_logprobs is not None
+        ]
+        topk_vals_cuda, topk_indices_cuda = torch.topk(
+            group_logprobs_cuda,
+            k=max(requests[req_id].py_num_logprobs for req_id in group_req_indices),
+            dim=-1,
+        )
+
+        sampled_vals_cuda = torch.gather(
+            group_logprobs_cuda, dim=-1, index=group_next_tokens_cuda.view(-1, 1)
+        )
+        sampled_indices_cuda = group_next_tokens_cuda
+
+        # NB: group_logprobs_cuda is no longer needed, so its storage is reused in place.
+        # Only a 0-based rank is computed here; it is corrected to 1-indexed in handle_logprobs.
+        group_logprobs_cuda.greater_(sampled_vals_cuda)
+        sampled_rank_cuda = group_logprobs_cuda.sum(dim=-1)
+
+        # Stage the device-to-host copies back to back to reduce overheads
+        topk_vals = torch.empty_like(topk_vals_cuda, device="cpu", pin_memory=False)
+        topk_indices = torch.empty_like(topk_indices_cuda, device="cpu", pin_memory=False)
+        sampled_vals = torch.empty_like(sampled_vals_cuda, device="cpu", pin_memory=False)
+        sampled_indices = torch.empty_like(sampled_indices_cuda, device="cpu", pin_memory=False)
+        sampled_rank = torch.empty_like(sampled_rank_cuda, device="cpu", pin_memory=False)
+
+        topk_vals.copy_(topk_vals_cuda, non_blocking=True)
+        topk_indices.copy_(topk_indices_cuda, non_blocking=True)
+        sampled_vals.copy_(sampled_vals_cuda, non_blocking=True)
+        sampled_indices.copy_(sampled_indices_cuda, non_blocking=True)
+        sampled_rank.copy_(sampled_rank_cuda, non_blocking=True)
+        current_offset = 0
+        for req_id, steps in zip(group_req_indices, req_num_steps[group_req_indices].tolist()):
+            req = requests[req_id]
+            next_offset = current_offset + steps
+            # NB: Assigning views on memory which is being filled asynchronously
+            req.py_topk_logprobs_vals = topk_vals[current_offset:next_offset, : req.py_num_logprobs]
+            req.py_sampled_logprobs_vals = sampled_vals[current_offset:next_offset]
+            req.py_topk_logprobs_indices = topk_indices[
+                current_offset:next_offset, : req.py_num_logprobs
+            ]
+            req.py_sampled_logprobs_indices = sampled_indices[current_offset:next_offset]
+            req.py_sampled_logprobs_rank = sampled_rank[current_offset:next_offset]
+            current_offset = next_offset
+
     @nvtx_range("_process_requests")
     def _process_requests(
         self,
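The rank computation in `_process_logprobs` avoids a full sort: it gathers the sampled token's logprob and counts how many vocabulary entries beat it, which yields a 0-based rank (shifted to 1-based later in `handle_logprobs`). A compact sketch with plain tensors; unlike the diff, this version does not reuse the logprobs buffer in place:

```python
import torch

logprobs = torch.log_softmax(torch.tensor([[2.0, 0.5, 1.0, -1.0]]), dim=-1)  # (steps=1, vocab=4)
sampled = torch.tensor([[2]])                                                 # sampled token id per step

sampled_vals = torch.gather(logprobs, dim=-1, index=sampled)   # logprob of the sampled token
rank0 = (logprobs > sampled_vals).sum(dim=-1)                  # tokens strictly better => 0-based rank
print(sampled_vals, rank0 + 1)  # 1-indexed rank; here the sampled token is second best => rank 2
```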
@@ -2454,55 +2576,6 @@ def _process_requests(
             req_offsets=req_offsets,
         )
 
-        # Handle top-k logprobs. This is done outside the sampling loop,
-        # because the returned logprobs are specified to not reflect temperature scaling,
-        # top-k/top-p masking, etc.
-        if return_log_probs:
-            assert logits_cuda.dim() == 2, "logits should be 2D"
-
-            logprobs_req_indices = [
-                req_id for req_id, req in enumerate(requests) if req.py_num_logprobs
-            ]
-            logprobs_logit_indices = logits_cuda_indexer[logprobs_req_indices]
-            logprobs_logit_indices_cuda = logprobs_logit_indices.to(
-                device=logits_cuda.device, non_blocking=True
-            )
-            logprobs_cuda = F.log_softmax(
-                logits_cuda[logprobs_logit_indices_cuda].to(dtype=torch.float32, non_blocking=True),
-                dim=-1,
-            )
-            topk_vals_cuda, topk_indices_cuda = torch.topk(
-                logprobs_cuda, k=max(req.py_num_logprobs for req in requests), dim=-1
-            )
-            # Use a single D2H copy to reduce overheads
-            topk_vals = torch.empty_like(topk_vals_cuda, device="cpu", pin_memory=True)
-            topk_indices = torch.empty_like(topk_indices_cuda, device="cpu", pin_memory=True)
-            topk_vals.copy_(topk_vals_cuda, non_blocking=True)
-            topk_indices.copy_(topk_indices_cuda, non_blocking=True)
-            current_offset = 0
-            for req_id, steps in zip(
-                logprobs_req_indices, req_num_generated_tokens[logprobs_req_indices].tolist()
-            ):
-                req = requests[req_id]
-                next_offset = current_offset + steps
-                # NB: Assigning views on memory which is being filled asynchronously
-                req.py_topk_logprobs_vals = topk_vals[
-                    current_offset:next_offset, : req.py_num_logprobs
-                ]
-                req.py_topk_logprobs_indices = topk_indices[
-                    current_offset:next_offset, : req.py_num_logprobs
-                ]
-
-                # context requests do not have multiple input beams, but they need multiple output beams
-                if req.is_context_init_state:
-                    req.py_topk_logprobs_vals = req.py_topk_logprobs_vals.expand(
-                        req.sampling_config.beam_width, -1
-                    )
-                    req.py_topk_logprobs_indices = req.py_topk_logprobs_indices.expand(
-                        req.sampling_config.beam_width, -1
-                    )
-                current_offset = next_offset
-
         # Perform sampling in batches
         batched_sampling_result = self._sample_batched_by_strategy(
             logits_cuda,
@@ -2515,8 +2588,12 @@ def _process_requests(
             seq_lens=seq_lens,
             req_num_generated_tokens=req_num_generated_tokens,
             token_dtype=new_tokens_cuda.dtype,
+            return_log_probs=return_log_probs,
         )
 
+        if return_log_probs:
+            self._process_logprobs(batched_sampling_result, requests, req_num_steps)
+
         # Fill results into output buffers
         new_tokens_host = self._unbatch_sampling_results(
             batched_sampling_result,
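Taken together, the change moves logprob extraction from a pre-sampling pass over the raw logits to a post-sampling pass over the logits captured per group. The miniature below walks that flow end to end on a single toy step (mask, sample, then derive "processed" logprobs and rank from the same masked logits); it is a standalone illustration, not the batched group buffers from the diff.

```python
import torch

torch.manual_seed(0)
logits = torch.tensor([[3.0, 2.0, 0.5, -1.0]])

# Sampling step for a top-k=2 group: mask, softmax, sample.
topk = 2
masked = logits.clone()
masked[masked < masked.topk(topk, dim=-1).values[..., -1:]] = float("-inf")
probs = torch.softmax(masked, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)

# Logprob post-processing on the same (masked) logits, as the "processed" mode would see them.
logprobs = torch.log_softmax(masked, dim=-1)
sampled_logprob = torch.gather(logprobs, -1, next_token)
rank_1idx = (logprobs > sampled_logprob).sum(dim=-1) + 1
print(next_token.item(), sampled_logprob.item(), rank_1idx.item())
```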