@@ -95,6 +95,7 @@ def get_llm_args(
9595 free_gpu_memory_fraction : float = 0.9 ,
9696 num_postprocess_workers : int = 0 ,
9797 trust_remote_code : bool = False ,
98+ revision : Optional [str ] = None ,
9899 reasoning_parser : Optional [str ] = None ,
99100 fail_fast_on_attention_window_too_large : bool = False ,
100101 otlp_traces_endpoint : Optional [str ] = None ,
@@ -129,6 +130,7 @@ def get_llm_args(
129130 "moe_expert_parallel_size" : moe_expert_parallel_size ,
130131 "gpus_per_node" : gpus_per_node ,
131132 "trust_remote_code" : trust_remote_code ,
133+ "revision" : revision ,
132134 "build_config" : build_config ,
133135 "max_batch_size" : max_batch_size ,
134136 "max_num_tokens" : max_num_tokens ,
@@ -317,6 +319,11 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
317319 is_flag = True ,
318320 default = False ,
319321 help = "Flag for HF transformers." )
322+ @click .option ("--revision" ,
323+ type = str ,
324+ default = None ,
325+ help = "The revision to use for the HuggingFace model "
326+ "(branch name, tag name, or commit id)." )
320327@click .option (
321328 "--extra_llm_api_options" ,
322329 type = str ,
@@ -381,9 +388,9 @@ def serve(
381388 ep_size : Optional [int ], cluster_size : Optional [int ],
382389 gpus_per_node : Optional [int ], kv_cache_free_gpu_memory_fraction : float ,
383390 num_postprocess_workers : int , trust_remote_code : bool ,
384- extra_llm_api_options : Optional [str ], reasoning_parser : Optional [str ],
385- tool_parser : Optional [str ], metadata_server_config_file : Optional [str ],
386- server_role : Optional [str ],
391+ revision : Optional [str ], extra_llm_api_options : Optional [str ],
392+ reasoning_parser : Optional [str ], tool_parser : Optional [str ],
393+ metadata_server_config_file : Optional [ str ], server_role : Optional [str ],
387394 fail_fast_on_attention_window_too_large : bool ,
388395 otlp_traces_endpoint : Optional [str ], enable_chunked_prefill : bool ,
389396 disagg_cluster_uri : Optional [str ], media_io_kwargs : Optional [str ],
@@ -418,6 +425,7 @@ def serve(
418425 free_gpu_memory_fraction = kv_cache_free_gpu_memory_fraction ,
419426 num_postprocess_workers = num_postprocess_workers ,
420427 trust_remote_code = trust_remote_code ,
428+ revision = revision ,
421429 reasoning_parser = reasoning_parser ,
422430 fail_fast_on_attention_window_too_large =
423431 fail_fast_on_attention_window_too_large ,
0 commit comments