Skip to content

Commit c554fe4

Browse files
achartier authored and codego7250 committed
[NVIDIA#9463][feat] Add revision option to trtllm commands (NVIDIA#9498)
Signed-off-by: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
1 parent ff269c5 commit c554fe4

File tree

9 files changed

+41
-11
lines changed

9 files changed

+41
-11
lines changed

tensorrt_llm/bench/benchmark/low_latency.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def latency_command(
225225
if options.backend and options.backend.lower(
226226
) in ALL_SUPPORTED_BACKENDS and options.backend.lower() != "tensorrt":
227227
if bench_env.checkpoint_path is None:
228-
snapshot_download(options.model)
228+
snapshot_download(options.model, revision=bench_env.revision)
229229

230230
exec_settings = get_settings(params, metadata, bench_env.model,
231231
bench_env.checkpoint_path)
@@ -250,6 +250,7 @@ def latency_command(
250250
param_hint="backend")
251251

252252
exec_settings["model"] = options.model
253+
exec_settings["revision"] = bench_env.revision
253254
engine_tokens = exec_settings["settings_config"]["max_num_tokens"]
254255

255256
# Update configuration with runtime options

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ def throughput_command(
350350
# If we're dealing with a model name, perform a snapshot download to
351351
# make sure we have a local copy of the model.
352352
if bench_env.checkpoint_path is None:
353-
snapshot_download(options.model)
353+
snapshot_download(options.model, revision=bench_env.revision)
354354

355355
exec_settings = get_settings(params, metadata, bench_env.model,
356356
bench_env.checkpoint_path)
@@ -376,6 +376,7 @@ def throughput_command(
376376
param_hint="backend")
377377

378378
exec_settings["model"] = options.model
379+
exec_settings["revision"] = bench_env.revision
379380
engine_bs = exec_settings["settings_config"]["max_batch_size"]
380381
engine_tokens = exec_settings["settings_config"]["max_num_tokens"]
381382

tensorrt_llm/bench/dataclasses/configuration.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class RuntimeConfig(BaseModel):
2525
model: str
2626
model_path: Optional[Path] = None
2727
engine_dir: Optional[Path] = None
28+
revision: Optional[str] = None
2829
sw_version: str
2930
settings_config: ExecutorSettingsConfig
3031
# TODO: this is a dict corresponding to the Mapping class, the type should be

tensorrt_llm/bench/dataclasses/general.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class BenchmarkEnvironment(BaseModel):
1414
model: str
1515
checkpoint_path: Optional[Path]
1616
workspace: Path
17+
revision: Optional[str] = None
1718

1819

1920
class InferenceRequest(BaseModel):

tensorrt_llm/bench/dataclasses/reporting.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ def get_statistics_dict(self) -> Dict[str, Any]:
306306
"model": self.rt_cfg.model,
307307
"model_path": str(self.rt_cfg.model_path),
308308
"engine_dir": str(self.rt_cfg.engine_dir),
309+
"revision": self.rt_cfg.revision,
309310
"version": self.rt_cfg.sw_version,
310311
},
311312
}
@@ -539,6 +540,7 @@ def report_statistics(self) -> None:
539540
"===========================================================\n"
540541
f"Model:\t\t\t{engine['model']}\n"
541542
f"Model Path:\t\t{engine['model_path']}\n"
543+
f"Revision:\t\t{engine['revision'] or 'N/A'}\n"
542544
f"Engine Directory:\t{engine['engine_dir']}\n"
543545
f"TensorRT LLM Version:\t{engine['version']}\n"
544546
f"Dtype:\t\t\t{pretrain_cfg['dtype']}\n"
@@ -554,6 +556,7 @@ def report_statistics(self) -> None:
554556
"===========================================================\n"
555557
f"Model:\t\t\t{engine['model']}\n"
556558
f"Model Path:\t\t{engine['model_path']}\n"
559+
f"Revision:\t\t{engine['revision'] or 'N/A'}\n"
557560
f"TensorRT LLM Version:\t{engine['version']}\n"
558561
f"Dtype:\t\t\t{engine['dtype']}\n"
559562
f"KV Cache Dtype:\t\t{engine['kv_cache_dtype']}\n"

tensorrt_llm/bench/utils/data.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,21 +89,21 @@ def create_dataset_from_stream(
8989
while (line := stream.readline()) and len(task_ids) < max_requests:
9090
# We expect the data to come in as a JSON string.
9191
# For example:
92-
# {"prompt": "Generate an infinite response to the following:
92+
# {"task_id": 1, "prompt": "Generate an infinite response to the following:
9393
# There once was a man who.", "output_tokens": 1000}
9494
#
9595
# For multimodal data, the data should be of the form:
96-
# {"prompt": "Generate an infinite response to the following:
96+
# {"task_id": 1, "prompt": "Generate an infinite response to the following:
9797
# There once was a man who.", "output_tokens": 1000,
9898
# "media_paths": ["/path/to/image1.jpg", "/path/to/image2.jpg"]}
9999
#
100100
# For LoRA data, the data should be of the form:
101-
# {"prompt": "Generate an infinite response to the following:
101+
# {"task_id": 1, "prompt": "Generate an infinite response to the following:
102102
# There once was a man who.", "output_tokens": 1000,
103103
# "lora_request": {"lora_name": "my_lora", "lora_int_id": 1, "lora_path": "/path/to/lora"}}
104104
#
105105
# Each line should be a complete JSON dictionary with no indentation
106-
# or newline characters.
106+
# or newline characters. The task_id field is required.
107107
data = json.loads(line)
108108
prompts.append(data.get("prompt"))
109109
media_paths.append(data.get("media_paths", None))

tensorrt_llm/commands/bench.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from pathlib import Path
2+
from typing import Optional
23

34
import click
45

@@ -37,18 +38,25 @@
3738
type=click.Choice(severity_map.keys()),
3839
default='info',
3940
help="The logging level.")
41+
@click.option("--revision",
42+
type=str,
43+
default=None,
44+
help="The revision to use for the HuggingFace model "
45+
"(branch name, tag name, or commit id).")
4046
@click.pass_context
4147
def main(
4248
ctx,
4349
model: str,
4450
model_path: Path,
4551
workspace: Path,
4652
log_level: str,
53+
revision: Optional[str],
4754
) -> None:
4855
logger.set_level(log_level)
4956
ctx.obj = BenchmarkEnvironment(model=model,
5057
checkpoint_path=model_path,
51-
workspace=workspace)
58+
workspace=workspace,
59+
revision=revision)
5260

5361
# Create the workspace where we plan to store intermediate files.
5462
ctx.obj.workspace.mkdir(parents=True, exist_ok=True)

tensorrt_llm/commands/eval.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@
9292
is_flag=True,
9393
default=False,
9494
help="Flag for HF transformers.")
95+
@click.option("--revision",
96+
type=str,
97+
default=None,
98+
help="The revision to use for the HuggingFace model "
99+
"(branch name, tag name, or commit id).")
95100
@click.option("--extra_llm_api_options",
96101
type=str,
97102
default=None,
@@ -106,7 +111,8 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
106111
max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
107112
ep_size: Optional[int], gpus_per_node: Optional[int],
108113
kv_cache_free_gpu_memory_fraction: float, trust_remote_code: bool,
109-
extra_llm_api_options: Optional[str], disable_kv_cache_reuse: bool):
114+
revision: Optional[str], extra_llm_api_options: Optional[str],
115+
disable_kv_cache_reuse: bool):
110116
logger.set_level(log_level)
111117
build_config = BuildConfig(max_batch_size=max_batch_size,
112118
max_num_tokens=max_num_tokens,
@@ -125,6 +131,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
125131
"moe_expert_parallel_size": ep_size,
126132
"gpus_per_node": gpus_per_node,
127133
"trust_remote_code": trust_remote_code,
134+
"revision": revision,
128135
"build_config": build_config,
129136
"kv_cache_config": kv_cache_config,
130137
}

tensorrt_llm/commands/serve.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ def get_llm_args(
9595
free_gpu_memory_fraction: float = 0.9,
9696
num_postprocess_workers: int = 0,
9797
trust_remote_code: bool = False,
98+
revision: Optional[str] = None,
9899
reasoning_parser: Optional[str] = None,
99100
fail_fast_on_attention_window_too_large: bool = False,
100101
otlp_traces_endpoint: Optional[str] = None,
@@ -129,6 +130,7 @@ def get_llm_args(
129130
"moe_expert_parallel_size": moe_expert_parallel_size,
130131
"gpus_per_node": gpus_per_node,
131132
"trust_remote_code": trust_remote_code,
133+
"revision": revision,
132134
"build_config": build_config,
133135
"max_batch_size": max_batch_size,
134136
"max_num_tokens": max_num_tokens,
@@ -317,6 +319,11 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
317319
is_flag=True,
318320
default=False,
319321
help="Flag for HF transformers.")
322+
@click.option("--revision",
323+
type=str,
324+
default=None,
325+
help="The revision to use for the HuggingFace model "
326+
"(branch name, tag name, or commit id).")
320327
@click.option(
321328
"--extra_llm_api_options",
322329
type=str,
@@ -381,9 +388,9 @@ def serve(
381388
ep_size: Optional[int], cluster_size: Optional[int],
382389
gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
383390
num_postprocess_workers: int, trust_remote_code: bool,
384-
extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
385-
tool_parser: Optional[str], metadata_server_config_file: Optional[str],
386-
server_role: Optional[str],
391+
revision: Optional[str], extra_llm_api_options: Optional[str],
392+
reasoning_parser: Optional[str], tool_parser: Optional[str],
393+
metadata_server_config_file: Optional[str], server_role: Optional[str],
387394
fail_fast_on_attention_window_too_large: bool,
388395
otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
389396
disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str],
@@ -418,6 +425,7 @@ def serve(
418425
free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
419426
num_postprocess_workers=num_postprocess_workers,
420427
trust_remote_code=trust_remote_code,
428+
revision=revision,
421429
reasoning_parser=reasoning_parser,
422430
fail_fast_on_attention_window_too_large=
423431
fail_fast_on_attention_window_too_large,

0 commit comments

Comments (0)