[TRTLLM-9488][feat] use FlashInfer.sampling by default (#9545)

ixlmar · web-flow · commit 84a153159445 · 2025-12-02T16:29:55.000+08:00
Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
@@ -2719,7 +2719,7 @@ class TorchLlmArgs(BaseLlmArgs):
     _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
 
     disable_flashinfer_sampling: bool = Field(
-        default=True,
+        default=False,
         description=
         "Disable the use of FlashInfer.sampling. This option is likely to be removed in the future.",
         status="prototype",
diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml
@@ -109,7 +109,7 @@ methods:
         status: beta
       disable_flashinfer_sampling:
         annotation: bool
-        default: True
+        default: False
         status: prototype
       moe_config:
         annotation: tensorrt_llm.llmapi.llm_args.MoeConfig