@@ -9,6 +9,7 @@
 from typing import Any, List, Literal, Optional, cast
 
 import torch
+import torch.nn.functional as F
 
 from tensorrt_llm._torch.pyexecutor.make_decoding_batch_input_output import \
     MakeDecodingBatchInputOutput
@@ -891,13 +892,16 @@ def handle_logprobs(self, request: LlmRequest, state: SampleState, *,
                         beam: int, count: int):
         current_slice = slice(0, count), request.py_seq_slot, beam
         if request.py_return_log_probs:
-            assert state.host.log_probs is not None
-            log_probs = state.host.log_probs[request.py_seq_slot][beam][:count]
-            current_tokens = state.host.new_tokens[current_slice]
+            topk_log_probs_vals = request.py_topk_logprobs_vals[:count]
+            topk_log_probs_indices = request.py_topk_logprobs_indices[:count]
 
             token_log_probs = [{
-                int(token): Logprob(logprob=logprob, rank=1)
-            } for token, logprob in zip(current_tokens, log_probs.tolist())]
+                token: Logprob(logprob=logprob, rank=rank + 1)
+                for rank, (token, logprob) in enumerate(
+                    zip(topk_token.tolist(), topk_logprob.tolist()))
+            }
+                               for topk_token, topk_logprob in zip(
+                                   topk_log_probs_indices, topk_log_probs_vals)]
             assert beam == 0, "The following call relies on beam_width to be 1 - hence the list with a single element"
             request.py_result.append_log_probs([token_log_probs])
 
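Note on the new per-token dicts: each entry of `token_log_probs` now maps up to k candidate token ids to rank-ordered `Logprob` entries, rather than a single rank-1 token. A minimal self-contained sketch of the same assembly (plain dicts stand in for `Logprob`; tensor values are made up):

```python
import torch

# Assumed shapes: [num_tokens, k], as produced by torch.topk over log-probs.
topk_vals = torch.tensor([[-0.1, -2.3], [-0.5, -1.2]])
topk_indices = torch.tensor([[42, 7], [13, 99]])

token_log_probs = [{
    token: {"logprob": logprob, "rank": rank + 1}
    for rank, (token, logprob) in enumerate(
        zip(row_tokens.tolist(), row_vals.tolist()))
} for row_tokens, row_vals in zip(topk_indices, topk_vals)]
# -> first entry maps 42 -> rank 1 and 7 -> rank 2; second maps 13 -> rank 1, etc.
```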
@@ -1162,13 +1166,8 @@ def log_probs_host(
             self,
-            scheduled_requests: ScheduledRequests) -> Optional[torch.Tensor]:
-        """Shape: In lockstep with TRTLLMSampler: https://github.com/NVIDIA/TensorRT-LLM/blob/cea5dd1e3883b18bf50901a7f196f50a9544c28c/cpp/include/tensorrt_llm/runtime/decoderState.h#L103"""
-        if any(req.py_return_log_probs
-               for req in scheduled_requests.all_requests()):
-            return torch.empty(
-                (self.max_num_sequences, self.MAX_BEAM_WIDTH, self.max_tokens),
-                device="cpu",
-                pin_memory=True)
-        return None
+            scheduled_requests: ScheduledRequests) -> bool:
+        """Whether any scheduled request asks for log probs."""
+        return any(req.py_return_log_probs
+                   for req in scheduled_requests.all_requests())
 
     @override
     @torch.inference_mode()
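This hunk replaces the dense, pinned host allocation with a plain boolean flag: per-request top-k tensors are now copied to the CPU individually, so no `[max_num_sequences, beam, max_tokens]` staging buffer is needed. A rough sketch (sizes are hypothetical) of what the old path paid up front even when a single request wanted log probs:

```python
import torch

# Hypothetical sizes for illustration only; pin_memory needs a CUDA build.
max_num_sequences, max_beam_width, max_tokens = 256, 1, 8192
buf = torch.empty((max_num_sequences, max_beam_width, max_tokens),
                  device="cpu", pin_memory=True)
print(f"{buf.numel() * buf.element_size() / 2**20:.0f} MiB pinned")  # 8 MiB
```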
@@ -1198,8 +1197,7 @@ def sample_async(
             sampler_event.record()
         return SampleState(scheduled_requests=scheduled_requests,
                            device=SampleStateTensors(new_tokens=new_tokens),
-                           host=SampleStateTensors(new_tokens=new_tokens_host,
-                                                   log_probs=log_probs_host),
+                           host=SampleStateTensors(new_tokens=new_tokens_host),
                            sampler_event=sampler_event)
 
     @staticmethod
@@ -1308,12 +1306,22 @@ def _sample_batched_by_strategy(
         model_outputs: dict[str, torch.Tensor],
         *,
         cuda_device: torch.device,
-        log_probs_host: torch.Tensor | None = None,
+        log_probs_host: bool = False,
         req_num_steps: torch.Tensor,
         req_offsets: torch.Tensor,
         steps_dim_size: int,
         token_dtype: torch.dtype,
     ) -> _BatchedSamplingResult:
+        if log_probs_host:
+            assert logits_cuda.dim() == 2, "logits should be 2D"
+            logprobs = F.log_softmax(logits_cuda.to("cuda",
+                                                    dtype=torch.float32),
+                                     dim=-1)
+            topk_vals, topk_indices = torch.topk(logprobs,
+                                                 k=max(req.py_num_logprobs
+                                                       for req in requests),
+                                                 dim=-1)
+
         requests_by_strategy = _group_requests_by_sampling_strategy(
             requests, pin_memory=True)
         generator_cuda = self.get_generator(cuda_device)
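The added block computes a single top-k over the whole sampled-token batch, sized by the largest `py_num_logprobs` among the requests; each request later slices off only what it asked for. A standalone sketch of the same pattern (sizes and per-request k values are made up):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(5, 32000)      # [num_sampled_tokens, vocab_size]
num_logprobs_per_req = [2, 5, 1]    # assumed per-request top-k sizes
# float32 log-softmax for numerical stability, then one shared top-k.
logprobs = F.log_softmax(logits.float(), dim=-1)
topk_vals, topk_indices = torch.topk(logprobs,
                                     k=max(num_logprobs_per_req),
                                     dim=-1)
print(topk_vals.shape)  # torch.Size([5, 5])
```

One wide top-k amortizes the kernel launch across requests, at the cost of computing a few extra columns for requests that asked for a smaller k.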
@@ -1357,12 +1365,20 @@ def _sample_batched_by_strategy(
             # softmax_grp_indices: Indices of 'speculation_group_indices' entries requesting probs
             # speculation_softmax_indices: Indices of 'softmax_grp_indices' entries corresponding
             #     to requests with draft logits.
-            if log_probs_host is not None:
+            if log_probs_host:
                 softmax_req_indices = group_req_indices
                 softmax_grp_indices = torch.arange(len(group_req_indices),
                                                    dtype=torch.int32)
                 speculation_softmax_indices = torch.tensor(
                     speculation_group_indices, dtype=torch.int32)
+                for req_id in group_req_indices:
+                    req = requests[req_id]
+                    req.py_topk_logprobs_vals = topk_vals[
+                        logits_cuda_indexer[req_id], :req.py_num_logprobs].to(
+                            device="cpu", non_blocking=True)
+                    req.py_topk_logprobs_indices = topk_indices[
+                        logits_cuda_indexer[req_id], :req.py_num_logprobs].to(
+                            device="cpu", non_blocking=True)
             else:
                 speculation_group_indices_tensor = torch.tensor(
                     speculation_group_indices, dtype=torch.int32)
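The `.to(device="cpu", non_blocking=True)` copies above are asynchronous with respect to the host: the CPU tensors must not be read until the producing stream has been synchronized (in this sampler, the `sampler_event` recorded in `sample_async` provides that ordering before `handle_logprobs` runs). A small demonstration of the required discipline:

```python
import torch

if torch.cuda.is_available():
    vals_gpu = torch.randn(4, 8, device="cuda")
    # The copy is enqueued on the current stream and may still be in
    # flight when .to(...) returns.
    vals_cpu = vals_gpu.to(device="cpu", non_blocking=True)
    torch.cuda.current_stream().synchronize()  # make the copy visible
    assert torch.allclose(vals_cpu, vals_gpu.cpu())
```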
@@ -1462,7 +1478,7 @@ def _unbatch_sampling_results(
         new_tokens_cuda: torch.Tensor,
         req_num_steps: torch.Tensor,
         seq_slots: torch.Tensor,
-        log_probs_host: torch.Tensor | None = None,
+        log_probs_host: bool = False,
     ) -> torch.Tensor:
         beam = self.BEAM
         assert beam == 0, "beam_width != 1 not supported"
@@ -1479,17 +1495,6 @@ def _dims_canonically_ordered(t: torch.Tensor) -> bool:
         # Assert destination tensor dimensions are canonically ordered ("row"-major); this
         # matters for element ordering in the .view(...).scatter_(...) calls below.
         assert _dims_canonically_ordered(new_tokens_cuda)
-        assert log_probs_host is None or _dims_canonically_ordered(
-            log_probs_host)
-
-        # new_tokens_cuda indexed by
-        #     slice(0, steps), slot, beam
-        # log_probs_host indexed by
-        #     slot, beam, slice(0, steps)
-        # batch_... tensors indexed by slice(batch_req_index, batch_req_index + steps)
-        #
-        if log_probs_host is not None:
-            assert new_tokens_cuda.size(0) == log_probs_host.size(-2)
 
         # Construct index mapping from slice indices of computed tensors
         # (packed request_idx and step dimensions) to linearized indices
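`_dims_canonically_ordered` (named in the hunk header and the surviving assert) is not shown in this diff; a plausible stride-based reading of what it checks, assuming "canonically ordered" means the memory layout follows the dimension order (row-major):

```python
import torch

def dims_canonically_ordered(t: torch.Tensor) -> bool:
    # Row-major layout implies non-increasing strides across dimensions,
    # which is what .view(...).scatter_(...) relies on below.
    strides = t.stride()
    return all(a >= b for a, b in zip(strides, strides[1:]))

assert dims_canonically_ordered(torch.empty(3, 4, 5))
assert not dims_canonically_ordered(torch.empty(3, 4, 5).transpose(0, 1))
```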
@@ -1511,39 +1516,6 @@ def _dims_canonically_ordered(t: torch.Tensor) -> bool:
                 0, batch_dest_indices_1d_cuda,
                 batch_next_tokens_cuda_int)
         new_tokens_host = new_tokens_cuda.to("cpu", non_blocking=True)
-        # NB: In order to avoid a scatter_ on the host and the necessary D2H copy + synchronization,
-        # the 'step' and 'seq_slot' dimensions are unpacked on GPU and later asynchronously
-        # copied into the destination buffer. Note that this overwrites all 'step' and token slots for the
-        # requests in 'requests' (passed to _process_requests). In fact, the current implementation
-        # even overwrites the destination tensors completely (including slices corresponding to request
-        # slots not present in 'requests', cf. 'FIXME' below).
-        if log_probs_host is not None:
-            # FIXME: If log_probs_host were indexed by request indices, rather than request slots, this
-            # tensor could be packed densely along the request axis.
-            log_probs_cuda = torch.empty_like(
-                log_probs_host, device=batch_dest_indices_1d_cuda.device)
-            # FIXME: Needs a separate indexer because tensor layout differs from new_tokens_cuda
-            batch_dest_probs_cuda_indexer = _UnpackedStepIndexer(
-                seq_slots=seq_slots[batch_req_indices],
-                num_steps=req_num_steps[batch_req_indices],
-                steps_dim_size=new_tokens_cuda.size(0),
-                slots_dim_size=new_tokens_cuda.size(1),
-                dim_order=_UnpackedStepIndexer.DimOrder.SLOT_MAJOR,
-                index_dtype=torch.int64,  # enforced by Tensor.scatter_
-            )
-            batch_dest_probs_indices_cuda = batch_dest_probs_cuda_indexer[:].to(
-                batch_softmax_cuda.device, non_blocking=True)
-            # NB: torch.arange is needed to enable "advanced indexing",
-            # cf. https://numpy.org/devdocs/user/basics.indexing.html#integer-array-indexing
-            batch_token_probs = batch_softmax_cuda[
-                torch.arange(batch_softmax_cuda.size(0),
-                             device=batch_softmax_cuda.device,
-                             dtype=torch.int32), batch_next_tokens_cuda_int]
-            log_probs_cuda[:, beam,
-                           ...].view(-1, *log_probs_cuda.shape[3:]).scatter_(
-                               0, batch_dest_probs_indices_cuda,
-                               torch.log(batch_token_probs))
-            log_probs_host.copy_(log_probs_cuda, non_blocking=True)
         # For requests with LlmRequest.py_draft_logits, return py_target_probs
         for request, batch_softmax_index_cuda in py_draft_logits_indices:
             request.py_target_probs = batch_softmax_cuda[