Commit d66c983

[TRTLLM-9689][feat] Introduce max_topk_logprobs parameter and enhance logprobs handling
- Added max_topk_logprobs parameter to AutoDeployConfig and LlmRequest to control the number of top-k logprobs storable for each token.
- Updated TorchSampler to accommodate max_topk_logprobs in logprobs processing and storage.
- Enhanced logprobs handling in the sampling process to support both sampled and top-k logprobs.
- Enabled batched processing of logprobs to improve performance.
- Modified tests to validate the new max_topk_logprobs functionality and ensure correct logprobs output.

Signed-off-by: Stefan <[email protected]>
1 parent aabd9f4 commit d66c983
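
For context, a minimal usage sketch of the new knob (hypothetical values; it assumes the public LLM API shape visible in the test changes below, where max_topk_logprobs is passed to the LLM constructor and per-request logprobs are requested through SamplingParams):

    from tensorrt_llm import LLM, SamplingParams

    # Hypothetical values: cap stored top-k logprobs at 3 per token on the server side,
    # mirroring max_topk_logprobs=3 in tests/unittest/_torch/sampler/test_logits_logprobs.py.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_topk_logprobs=3)

    # With the semantics changed in this commit, logprobs appears to count the extra
    # top-k entries returned in addition to the sampled token's logprob
    # (logprobs=0 keeps only the sampled token's logprob).
    params = SamplingParams(max_tokens=16, logprobs=2)

    for output in llm.generate(["The capital of France is"], params):
        print(output.outputs[0].logprobs)  # per-token logprob info for the generated tokens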

File tree

10 files changed: +347 −183 lines

tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 3 additions & 0 deletions
@@ -200,6 +200,9 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings):
     max_num_tokens: Optional[int] = Field(default=None, description="The maximum number of tokens.")
     max_seq_len: int = Field(default=512, ge=1, description="The maximum sequence length.")
     max_batch_size: int = Field(default=8, ge=1, description="The maximum batch size.")
+    max_topk_logprobs: int = Field(
+        default=0, description="The maximum number of top-k logprobs to store for each token."
+    )
     attn_page_size: int = Field(
         default=64,
         ge=1,

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 1 addition & 0 deletions
@@ -770,6 +770,7 @@ def instantiate_sampler(
         max_num_sequences=max_num_sequences,
         max_beam_width=ad_config.max_beam_width,
         disable_overlap_scheduler=ad_config.disable_overlap_scheduler,
+        max_topk_logprobs=ad_config.max_topk_logprobs,
     )
     sampler = TorchSampler(sampler_args)

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 6 additions & 1 deletion
@@ -857,6 +857,7 @@ def create_torch_sampler_args(
         disable_overlap_scheduler: bool,
         disable_flashinfer_sampling: bool,
         enable_async_worker: bool,
+        max_topk_logprobs: int,
 ):
     max_num_sequences = max_batch_size * mapping.pp_size
     max_draft_len = (0 if speculative_config is None else
@@ -872,7 +873,9 @@ def create_torch_sampler_args(
         max_beam_width=max_beam_width,
         disable_flashinfer_sampling=disable_flashinfer_sampling,
         disable_overlap_scheduler=disable_overlap_scheduler,
-        enable_async_worker=enable_async_worker)
+        enable_async_worker=enable_async_worker,
+        max_topk_logprobs=max_topk_logprobs,
+    )


 def instantiate_sampler(
@@ -888,6 +891,7 @@ def instantiate_sampler(
         decoding_config: trtllm.DecodingConfig,
         kv_cache_config: KvCacheConfig,
         disable_flashinfer_sampling: bool,
+        max_topk_logprobs: int,
 ):
     enable_async_worker = (confidential_compute_enabled()
                            or llm_args.sampler_force_async_worker)
@@ -901,6 +905,7 @@ def instantiate_sampler(
         disable_overlap_scheduler=llm_args.disable_overlap_scheduler,
         disable_flashinfer_sampling=disable_flashinfer_sampling,
         enable_async_worker=enable_async_worker,
+        max_topk_logprobs=max_topk_logprobs,
     )
     decoding_mode = get_decoding_mode(decoding_config=decoding_config,
                                       max_beam_width=max_beam_width)

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 1 addition & 0 deletions
@@ -534,6 +534,7 @@ def drafting_loop_wrapper(model):
         decoding_config=decoding_config,
         kv_cache_config=kv_cache_config,
         disable_flashinfer_sampling=llm_args.disable_flashinfer_sampling,
+        max_topk_logprobs=llm_args.max_topk_logprobs,
     )
     logger.info(f"Using Sampler: {type(sampler).__name__}")

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 315 additions & 175 deletions
Large diffs are not rendered by default.

tensorrt_llm/llmapi/llm_args.py

Lines changed: 7 additions & 0 deletions
@@ -2848,6 +2848,13 @@ class TorchLlmArgs(BaseLlmArgs):
         status="prototype",
     )

+    max_topk_logprobs: int = Field(
+        default=0,
+        description=
+        "The maximum number of top-k logprobs per request to calculate each step. This does not affect the number of sampled logprobs.",
+        status="prototype",
+    )
+
     @property
     def quant_config(self) -> QuantConfig:
         if self._quant_config is None:

tensorrt_llm/sampling_params.py

Lines changed: 3 additions & 2 deletions
@@ -344,10 +344,11 @@ def _validate(self):
         if self.guided_decoding is not None:
             self.guided_decoding._validate()

-        # correct types as users might pass in logprob=True for Top-1 logprobs and logprobs=False for no logprobs
+        # correct types as users might pass in logprob=True for Top-0 logprobs and logprobs=False for no logprobs
         if self.logprobs is False:
             self.logprobs = None
-        self.logprobs = self.logprobs and int(self.logprobs)
+        if self.logprobs is True:
+            self.logprobs = 0
         self.prompt_logprobs = self.prompt_logprobs and int(self.prompt_logprobs)

     # NB: Static, because downstream code only holds instances of
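
As a standalone sketch (not the library code itself) of the normalization that _validate() now performs, and of how the new "Top-0" comment reads together with the max_topk_logprobs description, the boolean convenience values map onto the integer count of additional top-k logprobs like this:

    from typing import Optional, Union

    def normalize_logprobs(logprobs: Union[bool, int, None]) -> Optional[int]:
        """Mirror of the _validate() snippet above: False -> no logprobs,
        True -> 0 (sampled token's logprob only, no extra top-k),
        int k -> sampled logprob plus k top-k logprobs (presumably bounded by max_topk_logprobs)."""
        if logprobs is False:
            return None
        if logprobs is True:
            return 0
        return logprobs

    assert normalize_logprobs(False) is None
    assert normalize_logprobs(True) == 0
    assert normalize_logprobs(None) is None
    assert normalize_logprobs(5) == 5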

tests/unittest/_torch/sampler/test_beam_search.py

Lines changed: 6 additions & 5 deletions
@@ -125,7 +125,7 @@ def check_generation_logits(beam: CompletionOutput,
 def check_logprobs(beam: CompletionOutput, sampling_params: SamplingParams,
                    valid_tokens: int | None) -> None:
     """Check if the logprobs have the correct shape"""
-    if sampling_params.logprobs:
+    if sampling_params.logprobs is not None:
         generated_tokens = valid_tokens if valid_tokens is not None else sampling_params.max_tokens
         assert len(
             beam.logprobs
@@ -345,7 +345,7 @@ class GeneralTestParams:
     prompt_len = len(input_tokens)
     num_generated_tokens = 5
     seq_len = prompt_len + num_generated_tokens
-    num_logprobs = 1
+    num_logprobs = 0
     seq_slot = 4
     end_id = 99
     batch_size = 2
@@ -541,7 +541,7 @@ def create_default_request(test_params: GeneralTestParams) -> LlmRequest:
         end_id=test_params.end_id,
         sampling_config=SamplingConfig(
             sampling_params._get_sampling_config()),
-        return_log_probs=test_params.num_logprobs > 0,
+        return_log_probs=test_params.num_logprobs >= 0,
         num_logprobs=test_params.num_logprobs,
         is_streaming=False)

@@ -590,7 +590,7 @@ def test_create_beam_history():
     num_generated_tokens = test_params.num_generated_tokens
     seq_slot = test_params.seq_slot
     vocab_size = test_params.vocab_size
-    num_logprobs = test_params.num_logprobs
+    num_logprobs = test_params.num_logprobs + 1
     cache_indirection = sampler.store.cache_indirection
     original_tokens = sampler.store.original_tokens
     original_logprobs = torch.zeros(
@@ -663,7 +663,8 @@ def test_create_beam_history():
     # set the new log probs and tokens for the beam search sampling
     sampler.store.new_log_probs[
         seq_slot, :beam_width] = original_logprobs[:beam_width,
-                                                   num_generated_tokens - 1, 0]
+                                                   num_generated_tokens - 1,
+                                                   0:1]
     sampler.store.new_tokens[
         0,
         seq_slot, :beam_width] = original_logprob_indices[:beam_width,
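
The num_logprobs + 1 and 0:1 adjustments suggest that the sampler's logprob storage now holds the sampled token's logprob alongside the requested top-k entries. A small shape sketch under that reading (illustrative names and values only, not the sampler's actual buffers):

    import torch

    beam_width = 4              # illustrative beam width
    num_generated_tokens = 5    # as in GeneralTestParams
    num_logprobs = 0            # extra top-k logprobs requested per token (new test default)

    # One slot for the sampled token's logprob plus num_logprobs top-k entries per step.
    original_logprobs = torch.zeros(beam_width, num_generated_tokens, num_logprobs + 1)
    print(original_logprobs.shape)  # torch.Size([4, 5, 1])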

tests/unittest/_torch/sampler/test_logits_logprobs.py

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ def simple_llm(request) -> LLM:
                                  "TinyLlama-1.1B-Chat-v1.0"),
         max_batch_size=8,
         disable_flashinfer_sampling=disable_flashinfer_sampling,
+        max_topk_logprobs=3,
     )
     return llm

tests/unittest/api_stability/references/llm.yaml

Lines changed: 4 additions & 0 deletions
@@ -223,6 +223,10 @@ methods:
         annotation: Optional[Dict[str, str]]
         default: null
         status: prototype
+      max_topk_logprobs:
+        annotation: int
+        default: 0
+        status: prototype
     return_annotation: None
   generate:
     parameters:
