
Commit f4b3f73

Fix test
Signed-off-by: Chenjie Luo <[email protected]>
1 parent 3b3d08b commit f4b3f73

7 files changed: 35 additions, 32 deletions

CHANGELOG.rst

Lines changed: 1 addition & 1 deletion

@@ -6,9 +6,9 @@ Model Optimizer Changelog (Linux)
 
 **Deprecations**
 - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strong typing. Use ``engine_precision`` instead.
-
 - TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
+- ``int8_sq`` quantization format is deprecated from ``examples/vlm_ptq`` following TensorRT-LLM's switch to the torch backend.
 - ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
 
 **Bug Fixes**

examples/llm_ptq/hf_ptq.py

Lines changed: 11 additions & 1 deletion

@@ -583,7 +583,11 @@ def output_decode(generated_ids, input_shape):
         setattr(model.config, "architectures", full_model_config.architectures)
 
     start_time = time.time()
-    if model_type in ["t5", "bart", "whisper"] or args.sparsity_fmt != "dense":
+    if (
+        model_type in ["t5", "bart", "whisper"]
+        or args.sparsity_fmt != "dense"
+        or "int8_sq" in args.qformat
+    ):
         warnings.warn(
             "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
         )
@@ -604,6 +608,12 @@ def output_decode(generated_ids, input_shape):
                 f"Sparsity format {args.sparsity_fmt} not supported by unified export api."
             )
 
+        if args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1:
+            warnings.warn(
+                "Unified HF export format does not specify inference tensor parallel or pipeline parallel. "
+                "They will be set at deployment time."
+            )
+
         export_hf_checkpoint(
             full_model,
             export_dir=export_path,
examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 7 additions & 16 deletions

@@ -34,19 +34,6 @@ if [ -z "$MODEL_PATH" ]; then
     exit 1
 fi
 
-# Iterate over list of qformats provided and check if they are supported in HF export path
-IFS=","
-for qformat in $QFORMAT; do
-    case $qformat in
-        fp16 | bf16 | fp8 | fp8_pc_pt | fp8_pb_wo | int4_awq | nvfp4 | nvfp4_awq | w4a8_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;;
-        *)
-            echo "Unsupported quant argument: Expected one of: [fp16, bf16, fp8, fp8_pc_pt, fp8_pb_wo, int4_awq, nvfp4, nvfp4_awq, w4a8_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2
-            exit 1
-            ;;
-    esac
-done
-IFS=" "
-
 # Check if ENABLE_SPARSITY environment variable is set to "true"
 if [ "$SPARSITY_FMT" = "dense" ]; then
     ENABLE_SPARSITY=false
@@ -75,8 +62,6 @@ for qformat in $QFORMAT; do
 done
 IFS=" "
 
-echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
-
 script_dir="$(dirname "$(readlink -f "$0")")"
 
 pushd $script_dir/..
@@ -165,6 +150,8 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         --qformat="${QFORMAT// /,}" \
         --calib_size=$CALIB_SIZE \
         --batch_size=$CALIB_BATCH_SIZE \
+        --inference_tensor_parallel=$TP \
+        --inference_pipeline_parallel=$PP \
         $PTQ_ARGS \
         $AWQ_ARGS
 else
@@ -191,7 +178,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
     fi
 fi
 
-if [[ ! " fp8 nvfp4 bf16 fp16 int4_awq w4a8_awq " =~ " ${QFORMAT} " ]]; then
+if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
     echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
     exit 0
 fi
@@ -238,6 +225,8 @@ if [[ $TASKS =~ "lm_eval" ]]; then
 
     pip install -r requirements.txt
 
+    echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
+
     python lm_eval_tensorrt_llm.py \
         --model trt-llm \
         --model_args tokenizer=$MODEL_PATH,engine_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \
@@ -313,6 +302,7 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     pushd ../llm_eval/
 
     if [[ $TASKS =~ "livecodebench" ]]; then
+        echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
         bash run_livecodebench.sh $MODEL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt
         mkdir -p $SAVE_PATH/livecodebench
         mv LiveCodeBench/output/$MODEL_NAME/* $SAVE_PATH/livecodebench
@@ -321,6 +311,7 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     fi
 
     if [[ $TASKS =~ "simple_eval" ]]; then
+        echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
         bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt
         echo "Simple eval results are saved under $SAVE_PATH/simple_eval.txt."
     fi
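
The tightened deployment gate relies on bash's ``=~`` against a space-padded list: it matches only when ``$QFORMAT`` is exactly one of the listed single formats, so comma- or space-separated multi-format values fall through to the TensorRT-LLM advice message. A small self-contained sketch of the idiom, with illustrative values:

    #!/usr/bin/env bash
    # Space-padded membership test, as used by the deployment gate above.
    QFORMAT=nvfp4
    if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
        echo "deploy with TensorRT-LLM instead"   # e.g. int4_awq, or "fp8,nvfp4"
    else
        echo "supported for direct deployment"    # e.g. nvfp4
    fi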

examples/llm_ptq/scripts/parser.sh

Lines changed: 7 additions & 6 deletions

@@ -21,6 +21,8 @@ parse_options() {
     MODEL_PATH=""
     QFORMAT=""
     KV_CACHE_QUANT=""
+    TP=1
+    PP=1
     SPARSITY_FMT="dense"
     LM_EVAL_TASKS="mmlu,gsm8k"
     LM_EVAL_LIMIT=
@@ -34,20 +36,21 @@ parse_options() {
     USE_SEQ_DEVICE_MAP=false
 
     # Parse command-line options
-    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,input:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
+    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
 
     eval set -- "$ARGS"
     while true; do
         case "$1" in
             --model ) MODEL_PATH="$2"; shift 2;;
             --quant ) QFORMAT="$2"; shift 2;;
             --kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;;
+            --tp ) TP="$2"; shift 2;;
+            --pp ) PP="$2"; shift 2;;
             --sparsity ) SPARSITY_FMT="$2"; shift 2;;
             --awq_block_size ) AWQ_BLOCK_SIZE="$2"; shift 2;;
             --calib ) CALIB_SIZE="$2"; shift 2;;
             --calib_batch_size ) CALIB_BATCH_SIZE="$2"; shift 2;;
             --auto_quantize_bits ) AUTO_QUANTIZE_BITS="$2"; shift 2;;
-            --input ) BUILD_MAX_INPUT_LEN="$2"; shift 2;;
             --output ) BUILD_MAX_OUTPUT_LEN="$2"; shift 2;;
             --batch ) BUILD_MAX_BATCH_SIZE="$2"; shift 2;;
             --tasks ) TASKS="$2"; shift 2;;
@@ -68,7 +71,6 @@ parse_options() {
 
     DEFAULT_CALIB_SIZE=512
     DEFAULT_CALIB_BATCH_SIZE=0
-    DEFAULT_BUILD_MAX_INPUT_LEN=4096
     DEFAULT_BUILD_MAX_OUTPUT_LEN=1024
     DEFAULT_BUILD_MAX_BATCH_SIZE=2
 
@@ -78,9 +80,6 @@ parse_options() {
     if [ -z "$CALIB_BATCH_SIZE" ]; then
         CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE
     fi
-    if [ -z "$BUILD_MAX_INPUT_LEN" ]; then
-        BUILD_MAX_INPUT_LEN=$DEFAULT_BUILD_MAX_INPUT_LEN
-    fi
     if [ -z "$BUILD_MAX_OUTPUT_LEN" ]; then
         BUILD_MAX_OUTPUT_LEN=$DEFAULT_BUILD_MAX_OUTPUT_LEN
     fi
@@ -125,6 +124,8 @@ parse_options() {
     echo "================="
     echo "model: $MODEL_PATH"
    echo "quant: $QFORMAT"
+    echo "tp (TensorRT-LLM Checkpoint only): $TP"
+    echo "pp (TensorRT-LLM Checkpoint only): $PP"
     echo "sparsity: $SPARSITY_FMT"
     echo "awq_block_size: $AWQ_BLOCK_SIZE"
     echo "calib: $CALIB_SIZE"
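
With the parser change, the end-to-end driver would be invoked roughly as below. A hedged sketch: only ``--quant``, ``--tp``, ``--pp``, and ``--tasks quant`` come from this commit and its tests, and the model path is a placeholder:

    # Placeholder model path; --tp/--pp default to 1 and, per the echo labels
    # above, only take effect for TensorRT-LLM checkpoint export (e.g. int8_sq).
    scripts/huggingface_example.sh \
        --model <hf_model_dir> \
        --quant int8_sq \
        --tp 2 --pp 2 \
        --tasks quant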

examples/vlm_ptq/scripts/huggingface_example.sh

Lines changed: 2 additions & 2 deletions

@@ -35,10 +35,10 @@ if [ -z "$MODEL_PATH" ]; then
 fi
 
 case $QFORMAT in
-    fp8|int8_sq|int4_awq|w4a8_awq|nvfp4)
+    fp8|int4_awq|w4a8_awq|nvfp4)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, nvfp4]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, int4_awq, w4a8_awq, nvfp4]" >&2
         exit 1
 esac
 

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ disable_error_code = ["attr-defined"]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
 # print execution time for 20 slowest tests and generate coverage reports
-addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+#addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
 pythonpath = ["tests/"]
 markers = ["manual: Only run when --run-manual is given"]

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 6 additions & 5 deletions

@@ -47,7 +47,7 @@ def test_ptq_t5(self, command):
     @pytest.mark.parametrize(
         "command",
         [
-            PTQCommand(quant="fp8", min_sm=89),
+            PTQCommand(quant="fp8", min_sm=90),
         ],
         ids=PTQCommand.param_str,
     )
@@ -77,7 +77,9 @@ def test_ptq_whisper(self, command):
         "command",
         [
             PTQCommand(quant="int8_sq", kv_cache_quant="none"),
+            PTQCommand(quant="int8_sq", kv_cache_quant="none", tp=2, pp=2),
             PTQCommand(quant="int4_awq", kv_cache_quant="none"),
+            PTQCommand(quant="w4a8_awq", kv_cache_quant="none"),
             PTQCommand(quant="nvfp4"),
             PTQCommand(quant="nvfp4_awq"),
             # autoquant
@@ -104,13 +106,12 @@ def test_ptq_whisper(self, command):
             ),
             # sm89
             PTQCommand(quant="fp8", min_sm=89),
-            PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89),
-            PTQCommand(quant="w4a8_awq", kv_cache_quant="none", min_sm=89, max_sm=89),
-            # sm100
+            PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89),  # sm100
             PTQCommand(quant="nvfp4", min_sm=100),
             #
             # multi_gpu
-            PTQCommand(quant="nvfp4", min_gpu=2),
+            PTQCommand(quant="fp8", min_gpu=2, min_sm=89),
+            PTQCommand(quant="nvfp4", min_gpu=2, min_sm=100),
         ],
         ids=PTQCommand.param_str,
     )
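
To exercise just the updated cases locally, pytest's ``-k`` substring selection over the generated ids (``ids=PTQCommand.param_str``) should suffice. A sketch, assuming the ids contain the quant name:

    # Run only the int8_sq parametrizations, including the new tp=2/pp=2 case.
    pytest tests/examples/llm_ptq/test_llm_ptq.py -k "int8_sq" -ra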
