@@ -299,20 +299,28 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
               default=BuildConfig.model_fields["max_seq_len"].default,
               help="Maximum total length of one request, including prompt and outputs. "
               "If unspecified, the value is deduced from the model config.")
-@click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.')
-@click.option("--pp_size",
+@click.option("--tensor_parallel_size",
+              "--tp_size",
+              type=int,
+              default=1,
+              help='Tensor parallelism size.')
+@click.option("--pipeline_parallel_size",
+              "--pp_size",
               type=int,
               default=1,
               help='Pipeline parallelism size.')
-@click.option("--cp_size",
+@click.option("--context_parallel_size",
+              "--cp_size",
               type=int,
               default=1,
               help='Context parallelism size.')
-@click.option("--ep_size",
+@click.option("--moe_expert_parallel_size",
+              "--ep_size",
               type=int,
               default=None,
               help="expert parallelism size")
-@click.option("--cluster_size",
+@click.option("--moe_cluster_parallel_size",
+              "--cluster_size",
               type=int,
               default=None,
               help="expert cluster parallelism size")
@@ -321,7 +329,8 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
               default=None,
               help="Number of GPUs per node. Default to None, and it will be "
               "detected automatically.")
-@click.option("--kv_cache_free_gpu_memory_fraction",
+@click.option("--free_gpu_memory_fraction",
+              "--kv_cache_free_gpu_memory_fraction",
               type=float,
               default=0.9,
               help="Free GPU memory fraction reserved for KV Cache, "
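
The hunks above all follow the same aliasing pattern: the new descriptive flag is listed first and the old short flag is kept as an extra option string, so existing scripts keep working. As a standalone sketch (not part of this PR): click treats each additional option string as an alias for the same parameter and derives the Python parameter name from the first double-dash option string.

```python
import click


@click.command()
@click.option(
    "--tensor_parallel_size",
    "--tp_size",  # old spelling, kept as a backward-compatible alias
    type=int,
    default=1,
    help="Tensor parallelism size.")
def main(tensor_parallel_size: int):
    # The parameter name comes from the first long option,
    # so --tp_size and --tensor_parallel_size both bind here.
    click.echo(f"tensor_parallel_size={tensor_parallel_size}")


if __name__ == "__main__":
    main()
```

This is also why the `serve` signature below can switch to the new long names directly, with no dispatch or deprecation shim.
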
@@ -400,20 +409,22 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
               default=None,
               help="[Experimental] Specify a custom chat template. "
               "Can be a file path or one-liner template string")
-def serve(
-        model: str, tokenizer: Optional[str], host: str, port: int,
-        log_level: str, backend: str, max_beam_width: int, max_batch_size: int,
-        max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
-        cp_size: int, ep_size: Optional[int], cluster_size: Optional[int],
-        gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
-        num_postprocess_workers: int, trust_remote_code: bool,
-        revision: Optional[str], extra_llm_api_options: Optional[str],
-        reasoning_parser: Optional[str], tool_parser: Optional[str],
-        metadata_server_config_file: Optional[str], server_role: Optional[str],
-        fail_fast_on_attention_window_too_large: bool,
-        otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
-        disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str],
-        custom_module_dirs: list[Path], chat_template: Optional[str]):
+def serve(model: str, tokenizer: Optional[str], host: str, port: int,
+          log_level: str, backend: str, max_beam_width: int,
+          max_batch_size: int, max_num_tokens: int, max_seq_len: int,
+          tensor_parallel_size: int, pipeline_parallel_size: int,
+          context_parallel_size: int, moe_expert_parallel_size: Optional[int],
+          moe_cluster_parallel_size: Optional[int],
+          gpus_per_node: Optional[int], free_gpu_memory_fraction: float,
+          num_postprocess_workers: int, trust_remote_code: bool,
+          revision: Optional[str], extra_llm_api_options: Optional[str],
+          reasoning_parser: Optional[str], tool_parser: Optional[str],
+          metadata_server_config_file: Optional[str],
+          server_role: Optional[str],
+          fail_fast_on_attention_window_too_large: bool,
+          otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
+          disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str],
+          custom_module_dirs: list[Path], chat_template: Optional[str]):
     """Running an OpenAI API compatible server

     MODEL: model name | HF checkpoint path | TensorRT engine path
@@ -427,7 +438,6 @@ def serve(
             logger.error(
                 f"Failed to import custom module from {custom_module_dir}: {e}")
             raise e
-
     llm_args, _ = get_llm_args(
         model=model,
         tokenizer=tokenizer,
@@ -436,13 +446,13 @@ def serve(
         max_batch_size=max_batch_size,
         max_num_tokens=max_num_tokens,
         max_seq_len=max_seq_len,
-        tensor_parallel_size=tp_size,
-        pipeline_parallel_size=pp_size,
-        context_parallel_size=cp_size,
-        moe_expert_parallel_size=ep_size,
-        moe_cluster_parallel_size=cluster_size,
+        tensor_parallel_size=tensor_parallel_size,
+        pipeline_parallel_size=pipeline_parallel_size,
+        context_parallel_size=context_parallel_size,
+        moe_expert_parallel_size=moe_expert_parallel_size,
+        moe_cluster_parallel_size=moe_cluster_parallel_size,
         gpus_per_node=gpus_per_node,
-        free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
+        free_gpu_memory_fraction=free_gpu_memory_fraction,
         num_postprocess_workers=num_postprocess_workers,
         trust_remote_code=trust_remote_code,
         revision=revision,
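
Since the renamed flags must stay drop-in compatible, a quick way to check the aliasing end to end is click's test runner. A minimal sketch (an assumed test, not included in this diff):

```python
import click
from click.testing import CliRunner


@click.command()
@click.option("--free_gpu_memory_fraction",
              "--kv_cache_free_gpu_memory_fraction",
              type=float,
              default=0.9)
def cmd(free_gpu_memory_fraction: float):
    click.echo(str(free_gpu_memory_fraction))


runner = CliRunner()
# Old and new spellings must produce identical results.
old = runner.invoke(cmd, ["--kv_cache_free_gpu_memory_fraction", "0.5"])
new = runner.invoke(cmd, ["--free_gpu_memory_fraction", "0.5"])
assert old.output == new.output == "0.5\n"
```

With the `serve` parameters now matching the `get_llm_args` keyword names one to one, the pass-through in the last hunk becomes purely mechanical (`tensor_parallel_size=tensor_parallel_size`, and so on).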