`docs/source/features/sampling.md` (4 additions, 28 deletions)
````diff
@@ -16,30 +16,6 @@ The PyTorch backend supports a wide variety of features, listed below:
 
 ## General usage
 
-There are two sampling backends available.
-
-* Torch Sampler
-* TRTLLM Sampler
-
-Torch Sampler currently supports a superset of features of TRTLLM Sampler, and is intended as the long-term solution. One can specify which sampler to use explicitly with:
…
-By default, the sampling backend is chosen to be `auto`. This will use:
-
-* TRTLLM Sampler when using Beam Search.
-* Torch Sampler otherwise.
-
 Here is an example to run a model with basic usage of sampling parameters. This example prepares two identical prompts which will give different results due to the sampling parameters chosen:
 
 ```python
@@ -73,7 +49,7 @@ llm.generate(["Hello, my name is",
              sampling_params_1])
 ```
 
-### LLM API sampling behavior when using Torch Sampler
+### LLM API sampling behavior
 
 * The sampling is controlled via `SamplingParams`.
@@ -105,17 +81,17 @@ llm.generate(["Hello, my name is",
 
 ### Performance
 
-The Torch Sampler leverages the optimized sampling kernels provided by
+The sampler leverages the optimized sampling kernels provided by
 [FlashInfer](https://docs.flashinfer.ai/api/sampling.html). The sampler
 also uses the [sorting-free implementations](https://flashinfer.ai/2025/03/10/sampling.html)
 whenever possible. This optimization does not compute the complete set of token sampling probabilities
 (after top-k / top-p masking etc.), which typically can be omitted unless requested by the user or
 required for speculative decoding (rejection sampling).
-In case of unexpected problems, the use of FlashInfer in Torch Sampler can
+In case of unexpected problems, the use of FlashInfer in the sampler can
 be disabled via the `disable_flashinfer_sampling` config option (note that this option is likely
 to be removed in a future TensorRT LLM release).
 
-Moreover, Torch Sampler internally batches requests with compatible sampling parameters. This
+Moreover, the sampler internally batches requests with compatible sampling parameters. This
 can greatly reduce the overall latency of the sampling step when request batches are comprised
 of requests with very heterogeneous sampling strategies (e.g. a mix of requests using greedy and top-p-after-top-k sampling).
````
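The "top-p-after-top-k" strategy mentioned in the hunk above can be illustrated with a small, self-contained sketch. This is not TensorRT LLM's FlashInfer-backed implementation; the function name and toy logits below are illustrative assumptions, showing only the masking step the docs refer to:

```python
import math

def top_k_top_p_filter(logits, top_k, top_p):
    """Illustrative top-k-then-top-p masking (not TensorRT LLM's kernels).

    Keeps the top_k highest logits, then restricts those to the smallest
    prefix whose softmax probabilities sum to >= top_p. Returns the list
    of token ids that remain sampleable, highest-probability first.
    """
    # Rank token ids by logit, highest first, and keep the top_k of them.
    ranked = sorted(range(len(logits)), key=lambda i: logits[i], reverse=True)[:top_k]
    # Softmax over the surviving logits only (max-subtracted for stability).
    mx = max(logits[i] for i in ranked)
    exps = [math.exp(logits[i] - mx) for i in ranked]
    total = sum(exps)
    probs = [e / total for e in exps]
    # Nucleus (top-p) cut: smallest prefix with cumulative probability >= top_p.
    kept, cum = [], 0.0
    for tok, p in zip(ranked, probs):
        kept.append(tok)
        cum += p
        if cum >= top_p:
            break
    return kept
```

An actual sampler would then draw a token from the renormalized probabilities over the surviving ids; the sorting-free FlashInfer kernels mentioned above avoid materializing this full masked distribution.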
`tensorrt_llm/llmapi/llm_args.py` (3 additions, 2 deletions)
````diff
@@ -3014,8 +3014,9 @@ class TorchLlmArgs(BaseLlmArgs):
     sampler_type: Union[str, SamplerType] = Field(
         default=SamplerType.auto,
         description=
-        "The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. Defaults to auto, which will use TorchSampler unless BeamSearch is requested.",
-        status="beta")
+        "The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. Defaults to auto, which will use TorchSampler. "
+        "TRTLLMSampler is deprecated and will be removed in release 1.4.",
+        status="beta")
````