Skip to content

Commit e6187d8

Browse files
authored
[https://nvbugs/5708810][fix] Fix TRTLLMSampler (#9710)
Signed-off-by: Michal Guzek <[email protected]>
1 parent 9ba1426 commit e6187d8

File tree

3 files changed

+67
-2
lines changed

3 files changed

+67
-2
lines changed

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3135,7 +3135,11 @@ def update_requests_single_beam_single_step(self, state: SampleStateTRTLLM):
3135 3135
)
3136 3136
}
3137 3137
]
3138 -
cum_log_probs = [cum_log_probs_host[seq_slot]]
3138 +
cum_log_probs = [
3139 +
cum_log_probs_host[seq_slot][0]
3140 +
if isinstance(cum_log_probs_host[seq_slot], list)
3141 +
else cum_log_probs_host[seq_slot]
3142 +
]
3139 3143
request.py_result.append_log_probs([log_probs], cum_log_probs)
3140 3144
idx += 1
3141 3145

tensorrt_llm/executor/result.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -319,7 +319,14 @@ def _handle_sequence(self,
319 319
if response_tensors.request_perf_metrics is not None:
320 320
output.request_perf_metrics = response_tensors.request_perf_metrics
321 321

322 -
if self._done:
322 +
# Check if this specific sequence is finished (not just if the entire request is done)
323 +
# This is important for best_of > n sampling where sequences finish at different times
324 +
sequence_is_finished = (finish_reasons and finish_reasons[src_idx]
325 +
!= tllm.FinishReason.NOT_FINISHED
326 +
and finish_reasons[src_idx]
327 +
!= tllm.FinishReason.CANCELLED) or self._done
328 +
329 +
if sequence_is_finished:
323 330
if finish_reasons[src_idx] == tllm.FinishReason.END_ID:
324 331
output.finish_reason = 'stop'
325 332
elif finish_reasons[src_idx] == tllm.FinishReason.STOP_WORDS:
@@ -344,6 +351,9 @@ def _handle_sequence(self,
344 351
else:
345 352
raise ValueError(
346 353
f"Unknown finish reason: {finish_reasons[src_idx]}")
354 +
355 +
# Only record stats and do tracing when the entire request is done
356 +
if self._done:
347 357
self.record_stats(output, req_perf_metrics_dict)
348 358
self.do_tracing(output, req_perf_metrics_dict)
349 359

tests/unittest/_torch/sampler/test_trtllm_sampler.py

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -146,3 +146,54 @@ def test_torch_sampler_with_multi_token_stop_words(model_path):
146 146

147 147
assert len(text) > 0, "Should generate some text"
148 148
assert stop_string not in text, f"Stop string '{repr(stop_string)}' should not appear in the output"
149 +
150 +
151 +
@pytest.mark.high_cuda_memory
152 +
def test_trtllm_sampler_best_of_with_logprobs(model_path):
153 +
"""Test TRTLLMSampler with best_of > n and logprobs."""
154 +
155 +
llm = create_llm(model_path)
156 +
157 +
prompt = "The capital of France is"
158 +
159 +
sampling_config = SamplingParams(
160 +
max_tokens=10,
161 +
temperature=1.0,
162 +
top_k=2,
163 +
n=2, # Return 2 sequences
164 +
best_of=3, # Generate 3 candidates, pick best 2
165 +
logprobs=1 # Return log probabilities
166 +
)
167 +
168 +
outputs = llm.generate([prompt], sampling_params=sampling_config)
169 +
170 +
llm.shutdown()
171 +
172 +
assert len(outputs) == 1, "Should return one request output"
173 +
174 +
request_output = outputs[0]
175 +
completion_outputs = request_output.outputs
176 +
177 +
assert len(
178 +
completion_outputs
179 +
) == 2, f"Expected 2 outputs (n=2), got {len(completion_outputs)}"
180 +
181 +
for i, output in enumerate(completion_outputs):
182 +
assert len(output.text) > 0, f"Output {i} should have generated text"
183 +
184 +
assert output.finish_reason is not None, \
185 +
f"Output {i} must have a finish_reason"
186 +
187 +
assert output.cumulative_logprob is not None, \
188 +
f"Output {i} should have cumulative_logprob when logprobs is requested"
189 +
assert isinstance(output.cumulative_logprob, (float, int)), \
190 +
f"Output {i} cumulative_logprob should be a number, got {type(output.cumulative_logprob)}"
191 +
192 +
assert output.logprobs is not None, \
193 +
f"Output {i} should have logprobs when logprobs=1"
194 +
assert len(output.logprobs) == len(output.token_ids), \
195 +
f"Output {i} should have logprobs for each token"
196 +
197 +
198 +
if len(completion_outputs) >= 2:
199 +
assert completion_outputs[0].cumulative_logprob >= completion_outputs[1].cumulative_logprob, \
200 +
"Outputs should be sorted by cumulative log probability (best first)"

0 commit comments

Comments
 (0)