Skip to content

Commit f11eea7

Browse files
authored
[TRTLLM-10303][feat] Deprecate trtllm-serve CLI options (NVIDIA#12106)
Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
1 parent adfc542 commit f11eea7

File tree

3 files changed

+38
-16
lines changed

3 files changed

+38
-16
lines changed

tensorrt_llm/commands/serve.py

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def get_llm_args(
151151
trust_remote_code: bool = False,
152152
revision: Optional[str] = None,
153153
reasoning_parser: Optional[str] = None,
154-
fail_fast_on_attention_window_too_large: bool = False,
154+
fail_fast_on_attention_window_too_large: bool = True,
155155
otlp_traces_endpoint: Optional[str] = None,
156156
enable_chunked_prefill: bool = False,
157157
**llm_args_extra_dict: Any):
@@ -602,12 +602,15 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
602602
default=None,
603603
help=help_info_with_stability_tag("expert parallelism size",
604604
"beta"))
605-
@click.option("--moe_cluster_parallel_size",
606-
"--cluster_size",
607-
type=int,
608-
default=None,
609-
help=help_info_with_stability_tag(
610-
"expert cluster parallelism size", "beta"))
605+
@click.option(
606+
"--moe_cluster_parallel_size",
607+
"--cluster_size",
608+
type=int,
609+
default=None,
610+
help=help_info_with_stability_tag(
611+
"[Deprecated] Expert cluster parallelism size. "
612+
"This option is no longer supported and will be removed in a future release.",
613+
"deprecated"))
611614
@click.option(
612615
"--gpus_per_node",
613616
type=int,
@@ -686,10 +689,12 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
686689
@click.option(
687690
"--fail_fast_on_attention_window_too_large",
688691
is_flag=True,
689-
default=False,
692+
default=True,
690693
help=help_info_with_stability_tag(
691-
"Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache.",
692-
"prototype"))
694+
"[Deprecated] Exit with runtime error when attention window is too large "
695+
"to fit even a single sequence in the KV cache. Now defaults to True. "
696+
"This flag only affects the TRT backend and will be removed in a future release.",
697+
"deprecated"))
693698
@click.option("--otlp_traces_endpoint",
694699
type=str,
695700
default=None,
@@ -762,6 +767,18 @@ def serve(
762767
"""
763768
logger.set_level(log_level)
764769

770+
if moe_cluster_parallel_size is not None:
771+
logger.warning(
772+
"--moe_cluster_parallel_size / --cluster_size is deprecated and "
773+
"no longer supported. This option will be removed in a future release."
774+
)
775+
776+
if "--fail_fast_on_attention_window_too_large" in sys.argv:
777+
logger.warning(
778+
"--fail_fast_on_attention_window_too_large is deprecated. "
779+
"It now defaults to True and will be removed in a future release. "
780+
"This flag only affects the TRT backend.")
781+
765782
for custom_module_dir in custom_module_dirs:
766783
try:
767784
import_custom_module_from_dir(custom_module_dir)
@@ -994,8 +1011,8 @@ def serve_encoder(model: str, host: str, port: int, log_level: str,
9941011
"--metrics-log-interval",
9951012
type=int,
9961013
default=0,
997-
help=
998-
"The interval of logging metrics in seconds. Set to 0 to disable metrics logging."
1014+
help="[Deprecated] The interval of logging metrics in seconds. "
1015+
"This option is not connected to any functionality and will be removed in a future release."
9991016
)
10001017
def disaggregated(
10011018
config_file: Optional[str],
@@ -1009,6 +1026,11 @@ def disaggregated(
10091026

10101027
logger.set_level(log_level)
10111028

1029+
if metrics_log_interval != 0:
1030+
logger.warning(
1031+
"--metrics-log-interval is deprecated and not connected to any "
1032+
"functionality. This option will be removed in a future release.")
1033+
10121034
disagg_cfg = parse_disagg_config_file(config_file)
10131035

10141036
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:

tensorrt_llm/llmapi/llm_args.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2370,7 +2370,7 @@ class BaseLlmArgs(StrictBaseModel):
23702370
moe_cluster_parallel_size: Optional[int] = Field(
23712371
default=None,
23722372
description="The cluster parallel size for MoE model's expert weights.",
2373-
status="beta")
2373+
status="deprecated")
23742374

23752375
moe_tensor_parallel_size: Optional[int] = Field(
23762376
default=None,
@@ -2762,10 +2762,10 @@ class TrtLlmArgs(BaseLlmArgs):
27622762
description="The workspace for the model.")
27632763

27642764
fail_fast_on_attention_window_too_large: bool = Field(
2765-
default=False,
2765+
default=True,
27662766
description=
27672767
"Fail fast when attention window is too large to fit even a single sequence in the KV cache.",
2768-
status="prototype")
2768+
status="deprecated")
27692769

27702770
# Once set, the model will reuse the build_cache
27712771
enable_build_cache: Union[BuildCacheConfig,

tests/unittest/api_stability/references/llm.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ methods:
99
moe_cluster_parallel_size:
1010
annotation: Optional[int]
1111
default: null
12-
status: beta
12+
status: deprecated
1313
enable_attention_dp:
1414
annotation: bool
1515
default: False

0 commit comments

Comments (0)