
Commit ed6e98b

Update
Signed-off-by: Chenjie Luo <[email protected]>
1 parent 92e6900 commit ed6e98b

14 files changed: 64 additions & 143 deletions

CHANGELOG.rst

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Model Optimizer Changelog (Linux)
 
 **Deprecations**
 
-- TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``.
+- TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
 - ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
 
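Since the `benchmark` task is gone, performance runs now go through `trtllm-bench` directly. The sketch below mirrors the benchmark logic removed from `examples/llm_ptq/scripts/huggingface_example.sh` later in this commit; the model id, checkpoint path, and sequence lengths are illustrative placeholders rather than values fixed by this change.

    # Assumptions: $MODEL_PATH is the original Hugging Face model, $SAVE_PATH is the
    # quantized checkpoint exported by the "quant" task, and a TensorRT-LLM source tree
    # is available at $TRT_LLM_CODE_PATH (the removed script defaulted to /app/tensorrt_llm).

    # Synthesize a tokenized benchmarking dataset, as the removed script did.
    DATASET_TXT=$SAVE_PATH/synthetic_2048_512_256.txt
    python $TRT_LLM_CODE_PATH/benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $MODEL_PATH \
        token-norm-dist --input-mean 2048 --output-mean 512 --input-stdev 0 --output-stdev 0 \
        --num-requests 256 >$DATASET_TXT

    # Throughput benchmark of the quantized checkpoint with the PyTorch backend.
    trtllm-bench --model $MODEL_PATH --model_path $SAVE_PATH throughput --backend pytorch --dataset $DATASET_TXT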

examples/llm_ptq/README.md

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ scripts/huggingface_example.sh --type llama --model $HF_PATH --quant w4a8_awq,fp
 The above example perform `AutoQuantize` where the less quantization accuracy sensitive layers are quantized with `w4a8_awq` (specified by `--quant w4a8_awq`) and the more sensitive layers
 are kept un-quantized such that the effective bits is 4.8 (specified by `--auto_quantize_bits 4.8`).
 
-The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `build,mmlu,benchmark,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks).
+The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `quant,mmlu,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks).
 
 > *If GPU out-of-memory error is reported running the scripts, please try editing the scripts and reducing the max batch size to save GPU memory.*
 
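For reference, an invocation that exercises the renamed task list might look like the following sketch; the model path, quant format, and lm_eval task names are placeholders, with valid lm_eval task names taken from the lm-evaluation-harness list linked in the README text above.

    scripts/huggingface_example.sh --type llama --model $HF_PATH --quant fp8 \
        --tasks quant,lm_eval --lm_eval_tasks gsm8k,arc_challenge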

examples/llm_ptq/hf_ptq.py

Lines changed: 3 additions & 12 deletions
@@ -584,25 +584,16 @@ def output_decode(generated_ids, input_shape):
 
     start_time = time.time()
     if model_type in ["t5", "bart", "whisper"] or args.sparsity_fmt != "dense":
-        # Still export TensorRT-LLM checkpoints for the models not supported by the
-        # TensorRT-LLM torch runtime.
+        warnings.warn(
+            "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
+        )
 
         # Move meta tensor back to device before exporting.
         remove_hook_from_module(model, recurse=True)
 
-        dtype = None
-        if "w4a8_awq" in args.qformat:
-            # TensorRT-LLM w4a8 only support fp16 as the dtype.
-            dtype = torch.float16
-
-        # For Gemma2-27B, TRT-LLM only works with bfloat16 as the dtype.
-        if model_type == "gemma2":
-            dtype = torch.bfloat16
-
         export_tensorrt_llm_checkpoint(
             model,
             model_type,
-            dtype=dtype,
             export_dir=export_path,
             inference_tensor_parallel=args.inference_tensor_parallel,
             inference_pipeline_parallel=args.inference_pipeline_parallel,

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 7 additions & 65 deletions
@@ -87,11 +87,9 @@ fi
 
 QFORMAT_MODIFIED="${QFORMAT//,/_}"
 
-MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')
+MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
 
-MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_${EXPORT_FORMAT}
-
-SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_FULL_NAME}
+SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
 
 MODEL_CONFIG=${SAVE_PATH}/config.json
 
@@ -188,13 +186,13 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
 
         if [ "$cuda_major" -lt 10 ]; then
-            echo "Please build the tensorrt_llm engine on Blackwell GPU for deployment. Checkpoint export_path: $SAVE_PATH"
+            echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH"
             exit 0
         fi
     fi
 
     if [[ ! " fp8 nvfp4 bf16 fp16 int4_awq w4a8_awq " =~ " ${QFORMAT} " ]]; then
-        echo "Quant $QFORMAT not supported with the TensorRT-LLM torch llmapi. Allowed values are: fp8, nvfp4, bf16, fp16, int4_awq, w4a8_awq"
+        echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
         exit 0
     fi
 
@@ -315,15 +313,15 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     pushd ../llm_eval/
 
     if [[ $TASKS =~ "livecodebench" ]]; then
-        bash run_livecodebench.sh $MODEL_FULL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt
+        bash run_livecodebench.sh $MODEL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt
         mkdir -p $SAVE_PATH/livecodebench
-        mv LiveCodeBench/output/$MODEL_FULL_NAME/* $SAVE_PATH/livecodebench
+        mv LiveCodeBench/output/$MODEL_NAME/* $SAVE_PATH/livecodebench
         echo "LiveCodeBench results are saved under $SAVE_PATH/livecodebench."
 
     fi
 
     if [[ $TASKS =~ "simple_eval" ]]; then
-        bash run_simple_eval.sh $MODEL_FULL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt
+        bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt
         echo "Simple eval results are saved under $SAVE_PATH/simple_eval.txt."
     fi
 
@@ -332,61 +330,5 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     kill $SERVE_PID
 fi
 
-if [[ $TASKS =~ "benchmark" ]]; then
-
-    if [ "$PP" -ne 1 ]; then
-        echo "Benchmark does not work with multi PP. Please run the c++ benchmark in the TensorRT-LLM repo..."
-        exit 1
-    fi
-
-    BENCHMARK_RESULT=${SAVE_PATH}/benchmark.txt
-    echo "Evaluating performance, result saved to $BENCHMARK_RESULT..."
-
-    # Prepare datasets for TRT-LLM benchmark
-    if [ -z "$TRT_LLM_CODE_PATH" ]; then
-        TRT_LLM_CODE_PATH=/app/tensorrt_llm
-        echo "Setting default TRT_LLM_CODE_PATH to $TRT_LLM_CODE_PATH."
-    fi
-
-    # Synthesize the tokenized benchmarking dataset
-    TRT_LLM_PREPARE_DATASET=$TRT_LLM_CODE_PATH/benchmarks/cpp/prepare_dataset.py
-
-    # Align with the official benchmark
-    BENCHMARK_INPUT_LEN=$BUILD_MAX_INPUT_LEN
-    BENCHMARK_OUTPUT_LEN=$BUILD_MAX_OUTPUT_LEN
-    BENCHMARK_NUM_REQUESTS=256
-
-    DATASET_TXT=${SAVE_PATH}/synthetic_${BENCHMARK_INPUT_LEN}_${BENCHMARK_OUTPUT_LEN}_${BENCHMARK_NUM_REQUESTS}.txt
-
-    if [ -z "$TRT_LLM_PREPARE_DATASET" ]; then
-        echo "Unable to prepare dataset for benchmarking. Please set TRT_LLM_CODE_PATH to the TRT-LLM code path."
-    else
-        if ! [ -f $DATASET_TXT ]; then
-            python $TRT_LLM_PREPARE_DATASET --stdout --tokenizer $MODEL_PATH token-norm-dist \
-                --input-mean $BENCHMARK_INPUT_LEN --output-mean $BENCHMARK_OUTPUT_LEN --input-stdev 0 --output-stdev 0 \
-                --num-requests $BENCHMARK_NUM_REQUESTS >$DATASET_TXT
-        else
-            echo "Use existing benchmark dataset in $DATASET_TXT."
-        fi
-    fi
-
-    MODEL_ARGS="--model_path $SAVE_PATH "
-    EXTRA_ARGS="--backend pytorch "
-
-    if [ "$BUILD_MAX_BATCH_SIZE" -gt 1 ]; then
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS throughput $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
-    else
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS latency $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
-    fi
-
-fi
-
-if [ -n "$FREE_SPACE" ]; then
-    rm -f $SAVE_PATH/*.json
-    rm -f $SAVE_PATH/*.safetensors
-    rm -f $SAVE_PATH/*/*.json
-    rm -f $SAVE_PATH/*/*.engine
-    rm -f $SAVE_PATH/*/*.cache
-fi
 
 popd

examples/llm_ptq/scripts/parser.sh

Lines changed: 1 addition & 2 deletions
@@ -54,7 +54,6 @@ parse_options() {
         --lm_eval_tasks ) LM_EVAL_TASKS="$2"; shift 2;;
         --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;;
         --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;;
-        --num_samples ) NUM_SAMPLES="$2"; shift 2;;
         --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;;
         --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;;
         --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;;
@@ -96,7 +95,7 @@ parse_options() {
         exit 1
     fi
 
-    VALID_TASKS=("quant" "mmlu" "mtbench" "benchmark" "lm_eval" "livecodebench" "simple_eval")
+    VALID_TASKS=("quant" "mmlu" "mtbench" "lm_eval" "livecodebench" "simple_eval")
 
     for task in $(echo "$TASKS" | tr ',' ' '); do
         is_valid_task=false

examples/vlm_ptq/scripts/huggingface_example.sh

Lines changed: 4 additions & 4 deletions
@@ -35,10 +35,10 @@ if [ -z "$MODEL_PATH" ]; then
 fi
 
 case $QFORMAT in
-    fp8|int8_sq|int4_awq|w4a8_awq|fp16|bf16|nvfp4)
+    fp8|int8_sq|int4_awq|w4a8_awq|nvfp4)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, nvfp4]" >&2
         exit 1
 esac
 
@@ -50,8 +50,8 @@ if [ -z "$ROOT_SAVE_PATH" ]; then
     ROOT_SAVE_PATH=$(pwd)
 fi
 
-MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')
-SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}_${QFORMAT}
+MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
+SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
 
 MODEL_CONFIG=${SAVE_PATH}/config.json
 
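With `fp16`/`bf16` dropped from the accepted formats, a quantization-only VLM run would look roughly like the sketch below. The model id mirrors the Qwen2-VL test asset added in tests/_test_utils/model.py later in this commit, and the flags are illustrative; the updated test helper no longer passes a `--type` argument, so it is omitted here as well.

    scripts/huggingface_example.sh --model Qwen/Qwen2-VL-2B-Instruct --quant nvfp4 --tasks quant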

modelopt/deploy/llm/generate.py

Lines changed: 3 additions & 4 deletions
@@ -30,7 +30,6 @@
     from tensorrt_llm.llmapi import CudaGraphConfig
     from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
     from tensorrt_llm.llmapi.llm import LLM as TRTLLM
-    from tensorrt_llm.llmapi.tokenizer import TokenizerBase
 except ImportError:
     print("Please upgrade tensorrt-llm to 1.1.0rc2 or later")
     raise
@@ -57,7 +56,7 @@ class LLM(TRTLLM):
     def __init__(
         self,
         checkpoint_dir: str | Path,
-        tokenizer: "str | Path | TokenizerBase | None" = None,
+        tokenizer: "str | Path | None" = None,
         kv_cache_config: dict[str, int | float] = {},
         medusa_choices: Any = None,
         tp: int = 0,
@@ -67,7 +66,7 @@ def __init__(
         """Initializes the LLM runner class.
 
         Args:
-            engine_dir: the directory path of the TensorRT-LLM engine.
+            checkpoint_dir: the directory path of the model checkpoint.
             tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model.
             kv_cache_config: the kv cache config as a dict. Please refer to
                 https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/
@@ -112,7 +111,7 @@ def _find_max_position_embeddings(cfg: dict) -> int | None:
         # Check if any key in config contains both "num" and "experts"
         ep = 1
         enable_attention_dp = False
-        for k in config.keys():
+        for k in config:
             if "num" in k and "experts" in k:
                 ep = torch.cuda.device_count()
                 enable_attention_dp = True
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ disable_error_code = ["attr-defined"]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
 # print execution time for 20 slowest tests and generate coverage reports
-addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+# addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
 pythonpath = ["tests/"]
 markers = ["manual: Only run when --run-manual is given"]
 
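With `addopts` commented out, the summary and coverage flags are no longer applied automatically. If they are still wanted for a local run they can be passed on the command line, for example (a sketch assuming pytest and pytest-cov are installed; whether CI supplies these flags elsewhere is outside this diff):

    pytest -ra --cov-report=term-missing --cov-config=pyproject.toml --durations=20 --strict-markers tests/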

tests/_test_utils/examples/run_command.py

Lines changed: 4 additions & 4 deletions
@@ -117,16 +117,16 @@ def run_llm_export_command(
 
 def run_llm_ptq_command(*, model: str, quant: str, **kwargs):
     kwargs.update({"model": model, "quant": quant})
-    kwargs.setdefault("tasks", "build")
+    kwargs.setdefault("tasks", "quant")
     kwargs.setdefault("calib", 16)
 
     cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh", "--no-verbose"], **kwargs)
     run_example_command(cmd_parts, "llm_ptq")
 
 
-def run_vlm_ptq_command(*, model: str, type: str, quant: str, **kwargs):
-    kwargs.update({"model": model, "type": type, "quant": quant})
-    kwargs.setdefault("tasks", "build")
+def run_vlm_ptq_command(*, model: str, quant: str, **kwargs):
+    kwargs.update({"model": model, "quant": quant})
+    kwargs.setdefault("tasks", "quant")
     kwargs.setdefault("calib", 16)
 
     cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh"], **kwargs)

tests/_test_utils/model.py

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,11 @@ def _select_path(remote_id: str, local_id: str) -> str:
     local_id="llava-1.5-7b-hf",
 )
 
+QWEN_VL_PATH = _select_path(
+    remote_id="Qwen/Qwen2-VL-2B-Instruct",
+    local_id="Qwen2-VL-2B-Instruct",
+)
+
 # Diffusers
 FLUX_SCHNELL_PATH = _select_path(
     remote_id="hf-internal-testing/tiny-flux-pipe",
