
Commit f4b3f73

Fix test
Signed-off-by: Chenjie Luo <[email protected]>
1 parent 3b3d08b commit f4b3f73

7 files changed: 35 additions, 32 deletions

CHANGELOG.rst

Lines changed: 1 addition & 1 deletion

@@ -6,9 +6,9 @@ Model Optimizer Changelog (Linux)
 
 **Deprecations**
 - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strong typing. Use ``engine_precision`` instead.
-
 - TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
+- ``int8_sq`` quantization format is deprecated from ``examples/vlm_ptq`` following TensorRT-LLM's switch to the torch backend.
 - ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
 
 **Bug Fixes**

examples/llm_ptq/hf_ptq.py

Lines changed: 11 additions & 1 deletion

@@ -583,7 +583,11 @@ def output_decode(generated_ids, input_shape):
         setattr(model.config, "architectures", full_model_config.architectures)
 
     start_time = time.time()
-    if model_type in ["t5", "bart", "whisper"] or args.sparsity_fmt != "dense":
+    if (
+        model_type in ["t5", "bart", "whisper"]
+        or args.sparsity_fmt != "dense"
+        or "int8_sq" in args.qformat
+    ):
         warnings.warn(
             "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
         )
@@ -604,6 +608,12 @@ def output_decode(generated_ids, input_shape):
                 f"Sparsity format {args.sparsity_fmt} not supported by unified export api."
             )
 
+        if args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1:
+            warnings.warn(
+                "Unified HF export format does not specify inference tensor parallel or pipeline parallel. "
+                "They will be set at deployment time."
+            )
+
         export_hf_checkpoint(
             full_model,
             export_dir=export_path,
examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 7 additions & 16 deletions

@@ -34,19 +34,6 @@ if [ -z "$MODEL_PATH" ]; then
     exit 1
 fi
 
-# Iterate over list of qformats provided and check if they are supported in HF export path
-IFS=","
-for qformat in $QFORMAT; do
-    case $qformat in
-        fp16 | bf16 | fp8 | fp8_pc_pt | fp8_pb_wo | int4_awq | nvfp4 | nvfp4_awq | w4a8_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;;
-        *)
-            echo "Unsupported quant argument: Expected one of: [fp16, bf16, fp8, fp8_pc_pt, fp8_pb_wo, int4_awq, nvfp4, nvfp4_awq, w4a8_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2
-            exit 1
-            ;;
-    esac
-done
-IFS=" "
-
 # Check if ENABLE_SPARSITY environment variable is set to "true"
 if [ "$SPARSITY_FMT" = "dense" ]; then
     ENABLE_SPARSITY=false
@@ -75,8 +62,6 @@ for qformat in $QFORMAT; do
 done
 IFS=" "
 
-echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
-
 script_dir="$(dirname "$(readlink -f "$0")")"
 
 pushd $script_dir/..
@@ -165,6 +150,8 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         --qformat="${QFORMAT// /,}" \
         --calib_size=$CALIB_SIZE \
         --batch_size=$CALIB_BATCH_SIZE \
+        --inference_tensor_parallel=$TP \
+        --inference_pipeline_parallel=$PP \
         $PTQ_ARGS \
         $AWQ_ARGS
 else
@@ -191,7 +178,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
     fi
 fi
 
-if [[ ! " fp8 nvfp4 bf16 fp16 int4_awq w4a8_awq " =~ " ${QFORMAT} " ]]; then
+if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
     echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
     exit 0
 fi
@@ -238,6 +225,8 @@ if [[ $TASKS =~ "lm_eval" ]]; then
 
     pip install -r requirements.txt
 
+    echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
+
     python lm_eval_tensorrt_llm.py \
         --model trt-llm \
         --model_args tokenizer=$MODEL_PATH,engine_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \
@@ -313,6 +302,7 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     pushd ../llm_eval/
 
     if [[ $TASKS =~ "livecodebench" ]]; then
+        echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
         bash run_livecodebench.sh $MODEL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt
         mkdir -p $SAVE_PATH/livecodebench
         mv LiveCodeBench/output/$MODEL_NAME/* $SAVE_PATH/livecodebench
@@ -321,6 +311,7 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     fi
 
     if [[ $TASKS =~ "simple_eval" ]]; then
+        echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
         bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt
         echo "Simple eval results are saved under $SAVE_PATH/simple_eval.txt."
     fi
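
The tightened deployment gate relies on bash's ``=~`` against a space-padded list: it matches only when ``$QFORMAT`` is exactly one of the listed single formats, so comma- or space-separated multi-format values fall through to the TensorRT-LLM advice message. A small self-contained sketch of the idiom, with illustrative values:

    #!/usr/bin/env bash
    # Space-padded membership test, as used by the deployment gate above.
    QFORMAT=nvfp4
    if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
        echo "deploy with TensorRT-LLM instead"   # e.g. int4_awq, or "fp8,nvfp4"
    else
        echo "supported for direct deployment"    # e.g. nvfp4
    fi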

examples/llm_ptq/scripts/parser.sh

Lines changed: 7 additions & 6 deletions

@@ -21,6 +21,8 @@ parse_options() {
     MODEL_PATH=""
     QFORMAT=""
     KV_CACHE_QUANT=""
+    TP=1
+    PP=1
     SPARSITY_FMT="dense"
     LM_EVAL_TASKS="mmlu,gsm8k"
     LM_EVAL_LIMIT=
@@ -34,20 +36,21 @@ parse_options() {
     USE_SEQ_DEVICE_MAP=false
 
     # Parse command-line options
-    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,input:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
+    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
 
     eval set -- "$ARGS"
     while true; do
         case "$1" in
             --model ) MODEL_PATH="$2"; shift 2;;
             --quant ) QFORMAT="$2"; shift 2;;
             --kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;;
+            --tp ) TP="$2"; shift 2;;
+            --pp ) PP="$2"; shift 2;;
             --sparsity ) SPARSITY_FMT="$2"; shift 2;;
             --awq_block_size ) AWQ_BLOCK_SIZE="$2"; shift 2;;
             --calib ) CALIB_SIZE="$2"; shift 2;;
             --calib_batch_size ) CALIB_BATCH_SIZE="$2"; shift 2;;
             --auto_quantize_bits ) AUTO_QUANTIZE_BITS="$2"; shift 2;;
-            --input ) BUILD_MAX_INPUT_LEN="$2"; shift 2;;
             --output ) BUILD_MAX_OUTPUT_LEN="$2"; shift 2;;
             --batch ) BUILD_MAX_BATCH_SIZE="$2"; shift 2;;
             --tasks ) TASKS="$2"; shift 2;;
@@ -68,7 +71,6 @@ parse_options() {
 
     DEFAULT_CALIB_SIZE=512
     DEFAULT_CALIB_BATCH_SIZE=0
-    DEFAULT_BUILD_MAX_INPUT_LEN=4096
     DEFAULT_BUILD_MAX_OUTPUT_LEN=1024
     DEFAULT_BUILD_MAX_BATCH_SIZE=2
 
@@ -78,9 +80,6 @@ parse_options() {
     if [ -z "$CALIB_BATCH_SIZE" ]; then
         CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE
     fi
-    if [ -z "$BUILD_MAX_INPUT_LEN" ]; then
-        BUILD_MAX_INPUT_LEN=$DEFAULT_BUILD_MAX_INPUT_LEN
-    fi
     if [ -z "$BUILD_MAX_OUTPUT_LEN" ]; then
         BUILD_MAX_OUTPUT_LEN=$DEFAULT_BUILD_MAX_OUTPUT_LEN
     fi
@@ -125,6 +124,8 @@ parse_options() {
     echo "================="
     echo "model: $MODEL_PATH"
    echo "quant: $QFORMAT"
+    echo "tp (TensorRT-LLM Checkpoint only): $TP"
+    echo "pp (TensorRT-LLM Checkpoint only): $PP"
     echo "sparsity: $SPARSITY_FMT"
     echo "awq_block_size: $AWQ_BLOCK_SIZE"
     echo "calib: $CALIB_SIZE"
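
With the parser change, the end-to-end driver would be invoked roughly as below. A hedged sketch: only ``--quant``, ``--tp``, ``--pp``, and ``--tasks quant`` come from this commit and its tests, and the model path is a placeholder:

    # Placeholder model path; --tp/--pp default to 1 and, per the echo labels
    # above, only take effect for TensorRT-LLM checkpoint export (e.g. int8_sq).
    scripts/huggingface_example.sh \
        --model <hf_model_dir> \
        --quant int8_sq \
        --tp 2 --pp 2 \
        --tasks quant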

examples/vlm_ptq/scripts/huggingface_example.sh

Lines changed: 2 additions & 2 deletions

@@ -35,10 +35,10 @@ if [ -z "$MODEL_PATH" ]; then
 fi
 
 case $QFORMAT in
-    fp8|int8_sq|int4_awq|w4a8_awq|nvfp4)
+    fp8|int4_awq|w4a8_awq|nvfp4)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, nvfp4]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, int4_awq, w4a8_awq, nvfp4]" >&2
         exit 1
 esac
 

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ disable_error_code = ["attr-defined"]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
 # print execution time for 20 slowest tests and generate coverage reports
-addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+#addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
 pythonpath = ["tests/"]
 markers = ["manual: Only run when --run-manual is given"]

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 6 additions & 5 deletions

@@ -47,7 +47,7 @@ def test_ptq_t5(self, command):
     @pytest.mark.parametrize(
         "command",
         [
-            PTQCommand(quant="fp8", min_sm=89),
+            PTQCommand(quant="fp8", min_sm=90),
         ],
         ids=PTQCommand.param_str,
     )
@@ -77,7 +77,9 @@ def test_ptq_whisper(self, command):
         "command",
         [
             PTQCommand(quant="int8_sq", kv_cache_quant="none"),
+            PTQCommand(quant="int8_sq", kv_cache_quant="none", tp=2, pp=2),
             PTQCommand(quant="int4_awq", kv_cache_quant="none"),
+            PTQCommand(quant="w4a8_awq", kv_cache_quant="none"),
             PTQCommand(quant="nvfp4"),
             PTQCommand(quant="nvfp4_awq"),
             # autoquant
@@ -104,13 +106,12 @@ def test_ptq_whisper(self, command):
             ),
             # sm89
             PTQCommand(quant="fp8", min_sm=89),
-            PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89),
-            PTQCommand(quant="w4a8_awq", kv_cache_quant="none", min_sm=89, max_sm=89),
-            # sm100
+            PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89),  # sm100
             PTQCommand(quant="nvfp4", min_sm=100),
             #
             # multi_gpu
-            PTQCommand(quant="nvfp4", min_gpu=2),
+            PTQCommand(quant="fp8", min_gpu=2, min_sm=89),
+            PTQCommand(quant="nvfp4", min_gpu=2, min_sm=100),
         ],
         ids=PTQCommand.param_str,
     )
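
To exercise just the updated cases locally, pytest's ``-k`` substring selection over the generated ids (``ids=PTQCommand.param_str``) should suffice. A sketch, assuming the ids contain the quant name:

    # Run only the int8_sq parametrizations, including the new tp=2/pp=2 case.
    pytest tests/examples/llm_ptq/test_llm_ptq.py -k "int8_sq" -ra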
