@@ -151,7 +151,7 @@ def get_llm_args(
151151 trust_remote_code : bool = False ,
152152 revision : Optional [str ] = None ,
153153 reasoning_parser : Optional [str ] = None ,
154- fail_fast_on_attention_window_too_large : bool = False ,
154+ fail_fast_on_attention_window_too_large : bool = True ,
155155 otlp_traces_endpoint : Optional [str ] = None ,
156156 enable_chunked_prefill : bool = False ,
157157 ** llm_args_extra_dict : Any ):
@@ -602,12 +602,15 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
602602 default = None ,
603603 help = help_info_with_stability_tag ("expert parallelism size" ,
604604 "beta" ))
605- @click .option ("--moe_cluster_parallel_size" ,
606- "--cluster_size" ,
607- type = int ,
608- default = None ,
609- help = help_info_with_stability_tag (
610- "expert cluster parallelism size" , "beta" ))
605+ @click .option (
606+ "--moe_cluster_parallel_size" ,
607+ "--cluster_size" ,
608+ type = int ,
609+ default = None ,
610+ help = help_info_with_stability_tag (
611+ "[Deprecated] Expert cluster parallelism size. "
612+ "This option is no longer supported and will be removed in a future release." ,
613+ "deprecated" ))
611614@click .option (
612615 "--gpus_per_node" ,
613616 type = int ,
@@ -686,10 +689,12 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
686689@click .option (
687690 "--fail_fast_on_attention_window_too_large" ,
688691 is_flag = True ,
689- default = False ,
692+ default = True ,
690693 help = help_info_with_stability_tag (
691- "Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache." ,
692- "prototype" ))
694+ "[Deprecated] Exit with runtime error when attention window is too large "
695+ "to fit even a single sequence in the KV cache. Now defaults to True. "
696+ "This flag only affects the TRT backend and will be removed in a future release." ,
697+ "deprecated" ))
693698@click .option ("--otlp_traces_endpoint" ,
694699 type = str ,
695700 default = None ,
@@ -762,6 +767,18 @@ def serve(
762767 """
763768 logger .set_level (log_level )
764769
770+ if moe_cluster_parallel_size is not None :
771+ logger .warning (
772+ "--moe_cluster_parallel_size / --cluster_size is deprecated and "
773+ "no longer supported. This option will be removed in a future release."
774+ )
775+
776+ if "--fail_fast_on_attention_window_too_large" in sys .argv :
777+ logger .warning (
778+ "--fail_fast_on_attention_window_too_large is deprecated. "
779+ "It now defaults to True and will be removed in a future release. "
780+ "This flag only affects the TRT backend." )
781+
765782 for custom_module_dir in custom_module_dirs :
766783 try :
767784 import_custom_module_from_dir (custom_module_dir )
@@ -994,8 +1011,8 @@ def serve_encoder(model: str, host: str, port: int, log_level: str,
9941011 "--metrics-log-interval" ,
9951012 type = int ,
9961013 default = 0 ,
997- help =
998- "The interval of logging metrics in seconds. Set to 0 to disable metrics logging ."
1014+ help = "[Deprecated] The interval of logging metrics in seconds. "
1015+ "This option is not connected to any functionality and will be removed in a future release ."
9991016)
10001017def disaggregated (
10011018 config_file : Optional [str ],
@@ -1009,6 +1026,11 @@ def disaggregated(
10091026
10101027 logger .set_level (log_level )
10111028
1029+ if metrics_log_interval != 0 :
1030+ logger .warning (
1031+ "--metrics-log-interval is deprecated and not connected to any "
1032+ "functionality. This option will be removed in a future release." )
1033+
10121034 disagg_cfg = parse_disagg_config_file (config_file )
10131035
10141036 with socket .socket (socket .AF_INET , socket .SOCK_STREAM ) as s :
0 commit comments