Commit 1d4fb89

Authored by Pengyun Lin (LinPoly)
[TRTLLM-8241][feat] Aliasing to comply to LlmArgs (#9586)
Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com>
1 parent 80ff901 commit 1d4fb89

File tree: 1 file changed (+37, −27 lines)

tensorrt_llm/commands/serve.py

Lines changed: 37 additions & 27 deletions

@@ -299,20 +299,28 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
               default=BuildConfig.model_fields["max_seq_len"].default,
               help="Maximum total length of one request, including prompt and outputs. "
               "If unspecified, the value is deduced from the model config.")
-@click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.')
-@click.option("--pp_size",
+@click.option("--tensor_parallel_size",
+              "--tp_size",
+              type=int,
+              default=1,
+              help='Tensor parallelism size.')
+@click.option("--pipeline_parallel_size",
+              "--pp_size",
               type=int,
               default=1,
               help='Pipeline parallelism size.')
-@click.option("--cp_size",
+@click.option("--context_parallel_size",
+              "--cp_size",
               type=int,
               default=1,
               help='Context parallelism size.')
-@click.option("--ep_size",
+@click.option("--moe_expert_parallel_size",
+              "--ep_size",
               type=int,
               default=None,
               help="expert parallelism size")
-@click.option("--cluster_size",
+@click.option("--moe_cluster_parallel_size",
+              "--cluster_size",
               type=int,
               default=None,
               help="expert cluster parallelism size")
@@ -321,7 +329,8 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
               default=None,
               help="Number of GPUs per node. Default to None, and it will be "
               "detected automatically.")
-@click.option("--kv_cache_free_gpu_memory_fraction",
+@click.option("--free_gpu_memory_fraction",
+              "--kv_cache_free_gpu_memory_fraction",
               type=float,
               default=0.9,
               help="Free GPU memory fraction reserved for KV Cache, "
@@ -400,20 +409,22 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
               default=None,
               help="[Experimental] Specify a custom chat template. "
               "Can be a file path or one-liner template string")
-def serve(
-        model: str, tokenizer: Optional[str], host: str, port: int,
-        log_level: str, backend: str, max_beam_width: int, max_batch_size: int,
-        max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
-        cp_size: int, ep_size: Optional[int], cluster_size: Optional[int],
-        gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
-        num_postprocess_workers: int, trust_remote_code: bool,
-        revision: Optional[str], extra_llm_api_options: Optional[str],
-        reasoning_parser: Optional[str], tool_parser: Optional[str],
-        metadata_server_config_file: Optional[str], server_role: Optional[str],
-        fail_fast_on_attention_window_too_large: bool,
-        otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
-        disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str],
-        custom_module_dirs: list[Path], chat_template: Optional[str]):
+def serve(model: str, tokenizer: Optional[str], host: str, port: int,
+          log_level: str, backend: str, max_beam_width: int,
+          max_batch_size: int, max_num_tokens: int, max_seq_len: int,
+          tensor_parallel_size: int, pipeline_parallel_size: int,
+          context_parallel_size: int, moe_expert_parallel_size: Optional[int],
+          moe_cluster_parallel_size: Optional[int],
+          gpus_per_node: Optional[int], free_gpu_memory_fraction: float,
+          num_postprocess_workers: int, trust_remote_code: bool,
+          revision: Optional[str], extra_llm_api_options: Optional[str],
+          reasoning_parser: Optional[str], tool_parser: Optional[str],
+          metadata_server_config_file: Optional[str],
+          server_role: Optional[str],
+          fail_fast_on_attention_window_too_large: bool,
+          otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
+          disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str],
+          custom_module_dirs: list[Path], chat_template: Optional[str]):
     """Running an OpenAI API compatible server
 
     MODEL: model name | HF checkpoint path | TensorRT engine path
@@ -427,7 +438,6 @@ def serve(
             logger.error(
                 f"Failed to import custom module from {custom_module_dir}: {e}")
             raise e
-
     llm_args, _ = get_llm_args(
         model=model,
         tokenizer=tokenizer,
@@ -436,13 +446,13 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
         max_batch_size=max_batch_size,
         max_num_tokens=max_num_tokens,
         max_seq_len=max_seq_len,
-        tensor_parallel_size=tp_size,
-        pipeline_parallel_size=pp_size,
-        context_parallel_size=cp_size,
-        moe_expert_parallel_size=ep_size,
-        moe_cluster_parallel_size=cluster_size,
+        tensor_parallel_size=tensor_parallel_size,
+        pipeline_parallel_size=pipeline_parallel_size,
+        context_parallel_size=context_parallel_size,
+        moe_expert_parallel_size=moe_expert_parallel_size,
+        moe_cluster_parallel_size=moe_cluster_parallel_size,
         gpus_per_node=gpus_per_node,
-        free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
+        free_gpu_memory_fraction=free_gpu_memory_fraction,
         num_postprocess_workers=num_postprocess_workers,
         trust_remote_code=trust_remote_code,
         revision=revision,
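
For context, the aliasing in this diff relies on standard click behavior: when one click.option declares several flag spellings, click derives the destination parameter name from the longest dashed name, so both the new long flags and the legacy short flags feed the same keyword argument. A minimal sketch of that mechanism, assuming stock click and using a hypothetical standalone command (not code from this commit):

import click

@click.command()
@click.option("--tensor_parallel_size",
              "--tp_size",
              type=int,
              default=1,
              help="Tensor parallelism size.")
def demo(tensor_parallel_size: int):
    # Both "--tensor_parallel_size 2" and "--tp_size 2" populate this argument,
    # because click names the parameter after the longest option spelling.
    click.echo(f"tensor_parallel_size={tensor_parallel_size}")

if __name__ == "__main__":
    demo()

With this pattern, an invocation such as "trtllm-serve <model> --tp_size 2" should keep working, while the long spelling "--tensor_parallel_size" now matches the LlmArgs field name that get_llm_args receives.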
