
Commit bfb2195

[TRTLLM-9687][fix] Enable pinned memory for tensor allocations in TorchSampler
- Updated tensor allocation in TorchSampler to use pinned memory, improving the performance of D2H copies.
- Modified test_sampled_token_always_in_logprobs to take a logprobs_mode parameter so log probabilities are exercised in both modes.

Signed-off-by: Stefan Niebler <[email protected]>
1 parent 3a28e24 commit bfb2195

File tree

2 files changed (+11, -6 lines)


tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 5 additions & 5 deletions
@@ -2596,11 +2596,11 @@ def _process_logprobs(
         sampled_rank_cuda = group_logprobs_cuda.sum(dim=-1).to(torch.int32)
 
         # Use a single D2H copy to reduce overheads
-        topk_vals = torch.empty_like(topk_vals_cuda, device="cpu", pin_memory=False)
-        topk_indices = torch.empty_like(topk_indices_cuda, device="cpu", pin_memory=False)
-        sampled_vals = torch.empty_like(sampled_vals_cuda, device="cpu", pin_memory=False)
-        sampled_indices = torch.empty_like(sampled_indices_cuda, device="cpu", pin_memory=False)
-        sampled_rank = torch.empty_like(sampled_rank_cuda, device="cpu", pin_memory=False)
+        topk_vals = torch.empty_like(topk_vals_cuda, device="cpu", pin_memory=True)
+        topk_indices = torch.empty_like(topk_indices_cuda, device="cpu", pin_memory=True)
+        sampled_vals = torch.empty_like(sampled_vals_cuda, device="cpu", pin_memory=True)
+        sampled_indices = torch.empty_like(sampled_indices_cuda, device="cpu", pin_memory=True)
+        sampled_rank = torch.empty_like(sampled_rank_cuda, device="cpu", pin_memory=True)
 
         topk_vals.copy_(topk_vals_cuda, non_blocking=True)
         topk_indices.copy_(topk_indices_cuda, non_blocking=True)
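
To see why this helps, here is a minimal standalone sketch of the pinned-buffer plus non_blocking copy pattern the updated code follows. It is not taken from the repository; the tensor name and shape are illustrative, and it assumes a CUDA device is available.

```python
import torch

# Illustrative stand-in for one of the *_cuda tensors in the diff.
vals_cuda = torch.randn(128, 8, device="cuda")

# Page-locked (pinned) host buffer with matching shape and dtype. Pinned memory
# lets the D2H copy below run asynchronously on the CUDA stream; with pageable
# host memory the transfer typically degrades to a synchronous staged copy.
vals_cpu = torch.empty_like(vals_cuda, device="cpu", pin_memory=True)

# Enqueue the device-to-host copy without blocking the Python thread.
vals_cpu.copy_(vals_cuda, non_blocking=True)

# The host must synchronize before it reads the copied values.
torch.cuda.current_stream().synchronize()
print(vals_cpu.sum())
```

In the diff, all five buffers are filled this way before a single synchronization point, which is what makes the batched non_blocking copies worthwhile.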

tests/unittest/_torch/sampler/test_logits_logprobs.py

Lines changed: 6 additions & 1 deletion
@@ -258,8 +258,10 @@ def test_generate_async_with_return_logits(
 
 @pytest.mark.parametrize("logprobs_k", [0, 1, 3],
                          ids=["top_0", "top_1", "top_3"])
+@pytest.mark.parametrize("logprobs_mode", ["raw", "processed"])
 @pytest.mark.threadleak(enabled=False)
-def test_sampled_token_always_in_logprobs(logprobs_k: int, simple_llm: LLM):
+def test_sampled_token_always_in_logprobs(logprobs_k: int, logprobs_mode: str,
+                                           simple_llm: LLM):
     """Two scenarios:
     - logprobs=0: Returns only sampled token (1 element)
     - logprobs=K (K>0): Returns top-K tokens + sampled token if not in top-K (up to K+1 elements)
@@ -270,6 +272,7 @@ def test_sampled_token_always_in_logprobs(logprobs_k: int, simple_llm: LLM):
         temperature=0.7,
         top_p=0.9,
         logprobs=logprobs_k,
+        logprobs_mode=logprobs_mode,
     )
 
     for output in simple_llm.generate(["The future of AI is"],
@@ -474,6 +477,8 @@ def test_processed_logprobs_e2e(logprobs_k: int, simple_llm: LLM):
         num_logits = len(generation_logits)
 
         for token_idx, token_logprobs_dict in enumerate(logprobs[:num_logits]):
+            assert token_ids[
+                token_idx] in token_logprobs_dict, "Sampled token not in logprobs"
 
             logits_for_token = generation_logits[token_idx:token_idx + 1]
             topk = sampling_params_list[req_idx].top_k
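
A note on the parametrization above: stacking two pytest.mark.parametrize decorators makes pytest run the cross product of the value sets, so the test now covers 3 logprobs_k values x 2 logprobs_mode values = 6 cases. The sketch below is standalone and illustrative, not the actual test, which additionally uses the simple_llm fixture and drives a real LLM.

```python
import pytest

# Stacked parametrize decorators expand into the full cross product of cases.
@pytest.mark.parametrize("logprobs_k", [0, 1, 3], ids=["top_0", "top_1", "top_3"])
@pytest.mark.parametrize("logprobs_mode", ["raw", "processed"])
def test_parametrization_matrix(logprobs_k: int, logprobs_mode: str):
    # pytest generates 6 test cases from the two parameter sets above.
    assert logprobs_k in (0, 1, 3)
    assert logprobs_mode in ("raw", "processed")
```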
