Skip to content

Commit f11eea7

Browse files
authored
[TRTLLM-10303][feat] Deprecate trtllm-serve CLI options (NVIDIA#12106)
Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
1 parent adfc542 commit f11eea7

File tree

3 files changed

+38
-16
lines changed

3 files changed

+38
-16
lines changed

tensorrt_llm/commands/serve.py

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def get_llm_args(
151151
trust_remote_code: bool = False,
152152
revision: Optional[str] = None,
153153
reasoning_parser: Optional[str] = None,
154-
fail_fast_on_attention_window_too_large: bool = False,
154+
fail_fast_on_attention_window_too_large: bool = True,
155155
otlp_traces_endpoint: Optional[str] = None,
156156
enable_chunked_prefill: bool = False,
157157
**llm_args_extra_dict: Any):
@@ -602,12 +602,15 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
602602
default=None,
603603
help=help_info_with_stability_tag("expert parallelism size",
604604
"beta"))
605-
@click.option("--moe_cluster_parallel_size",
606-
"--cluster_size",
607-
type=int,
608-
default=None,
609-
help=help_info_with_stability_tag(
610-
"expert cluster parallelism size", "beta"))
605+
@click.option(
606+
"--moe_cluster_parallel_size",
607+
"--cluster_size",
608+
type=int,
609+
default=None,
610+
help=help_info_with_stability_tag(
611+
"[Deprecated] Expert cluster parallelism size. "
612+
"This option is no longer supported and will be removed in a future release.",
613+
"deprecated"))
611614
@click.option(
612615
"--gpus_per_node",
613616
type=int,
@@ -686,10 +689,12 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
686689
@click.option(
687690
"--fail_fast_on_attention_window_too_large",
688691
is_flag=True,
689-
default=False,
692+
default=True,
690693
help=help_info_with_stability_tag(
691-
"Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache.",
692-
"prototype"))
694+
"[Deprecated] Exit with runtime error when attention window is too large "
695+
"to fit even a single sequence in the KV cache. Now defaults to True. "
696+
"This flag only affects the TRT backend and will be removed in a future release.",
697+
"deprecated"))
693698
@click.option("--otlp_traces_endpoint",
694699
type=str,
695700
default=None,
@@ -762,6 +767,18 @@ def serve(
762767
"""
763768
logger.set_level(log_level)
764769

770+
if moe_cluster_parallel_size is not None:
771+
logger.warning(
772+
"--moe_cluster_parallel_size / --cluster_size is deprecated and "
773+
"no longer supported. This option will be removed in a future release."
774+
)
775+
776+
if "--fail_fast_on_attention_window_too_large" in sys.argv:
777+
logger.warning(
778+
"--fail_fast_on_attention_window_too_large is deprecated. "
779+
"It now defaults to True and will be removed in a future release. "
780+
"This flag only affects the TRT backend.")
781+
765782
for custom_module_dir in custom_module_dirs:
766783
try:
767784
import_custom_module_from_dir(custom_module_dir)
@@ -994,8 +1011,8 @@ def serve_encoder(model: str, host: str, port: int, log_level: str,
9941011
"--metrics-log-interval",
9951012
type=int,
9961013
default=0,
997-
help=
998-
"The interval of logging metrics in seconds. Set to 0 to disable metrics logging."
1014+
help="[Deprecated] The interval of logging metrics in seconds. "
1015+
"This option is not connected to any functionality and will be removed in a future release."
9991016
)
10001017
def disaggregated(
10011018
config_file: Optional[str],
@@ -1009,6 +1026,11 @@ def disaggregated(
10091026

10101027
logger.set_level(log_level)
10111028

1029+
if metrics_log_interval != 0:
1030+
logger.warning(
1031+
"--metrics-log-interval is deprecated and not connected to any "
1032+
"functionality. This option will be removed in a future release.")
1033+
10121034
disagg_cfg = parse_disagg_config_file(config_file)
10131035

10141036
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:

tensorrt_llm/llmapi/llm_args.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2370,7 +2370,7 @@ class BaseLlmArgs(StrictBaseModel):
23702370
moe_cluster_parallel_size: Optional[int] = Field(
23712371
default=None,
23722372
description="The cluster parallel size for MoE model's expert weights.",
2373-
status="beta")
2373+
status="deprecated")
23742374

23752375
moe_tensor_parallel_size: Optional[int] = Field(
23762376
default=None,
@@ -2762,10 +2762,10 @@ class TrtLlmArgs(BaseLlmArgs):
27622762
description="The workspace for the model.")
27632763

27642764
fail_fast_on_attention_window_too_large: bool = Field(
2765-
default=False,
2765+
default=True,
27662766
description=
27672767
"Fail fast when attention window is too large to fit even a single sequence in the KV cache.",
2768-
status="prototype")
2768+
status="deprecated")
27692769

27702770
# Once set, the model will reuse the build_cache
27712771
enable_build_cache: Union[BuildCacheConfig,

tests/unittest/api_stability/references/llm.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ methods:
99
moe_cluster_parallel_size:
1010
annotation: Optional[int]
1111
default: null
12-
status: beta
12+
status: deprecated
1313
enable_attention_dp:
1414
annotation: bool
1515
default: False

0 commit comments

Comments (0)