
Commit ed6e98b

Update
Signed-off-by: Chenjie Luo <[email protected]>
1 parent 92e6900 commit ed6e98b

14 files changed: 64 additions & 143 deletions

CHANGELOG.rst

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Model Optimizer Changelog (Linux)
 
 **Deprecations**
 
-- TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``.
+- TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
 - ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
 
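Since the `benchmark` task is gone, performance runs now go through `trtllm-bench` directly. The sketch below mirrors the benchmark logic removed from `examples/llm_ptq/scripts/huggingface_example.sh` later in this commit; the model id, checkpoint path, and sequence lengths are illustrative placeholders rather than values fixed by this change.

    # Assumptions: $MODEL_PATH is the original Hugging Face model, $SAVE_PATH is the
    # quantized checkpoint exported by the "quant" task, and a TensorRT-LLM source tree
    # is available at $TRT_LLM_CODE_PATH (the removed script defaulted to /app/tensorrt_llm).

    # Synthesize a tokenized benchmarking dataset, as the removed script did.
    DATASET_TXT=$SAVE_PATH/synthetic_2048_512_256.txt
    python $TRT_LLM_CODE_PATH/benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $MODEL_PATH \
        token-norm-dist --input-mean 2048 --output-mean 512 --input-stdev 0 --output-stdev 0 \
        --num-requests 256 >$DATASET_TXT

    # Throughput benchmark of the quantized checkpoint with the PyTorch backend.
    trtllm-bench --model $MODEL_PATH --model_path $SAVE_PATH throughput --backend pytorch --dataset $DATASET_TXT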

examples/llm_ptq/README.md

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ scripts/huggingface_example.sh --type llama --model $HF_PATH --quant w4a8_awq,fp
 The above example perform `AutoQuantize` where the less quantization accuracy sensitive layers are quantized with `w4a8_awq` (specified by `--quant w4a8_awq`) and the more sensitive layers
 are kept un-quantized such that the effective bits is 4.8 (specified by `--auto_quantize_bits 4.8`).
 
-The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `build,mmlu,benchmark,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks).
+The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `quant,mmlu,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks).
 
 > *If GPU out-of-memory error is reported running the scripts, please try editing the scripts and reducing the max batch size to save GPU memory.*
 
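For reference, an invocation that exercises the renamed task list might look like the following sketch; the model path, quant format, and lm_eval task names are placeholders, with valid lm_eval task names taken from the lm-evaluation-harness list linked in the README text above.

    scripts/huggingface_example.sh --type llama --model $HF_PATH --quant fp8 \
        --tasks quant,lm_eval --lm_eval_tasks gsm8k,arc_challenge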

examples/llm_ptq/hf_ptq.py

Lines changed: 3 additions & 12 deletions
@@ -584,25 +584,16 @@ def output_decode(generated_ids, input_shape):
 
     start_time = time.time()
     if model_type in ["t5", "bart", "whisper"] or args.sparsity_fmt != "dense":
-        # Still export TensorRT-LLM checkpoints for the models not supported by the
-        # TensorRT-LLM torch runtime.
+        warnings.warn(
+            "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
+        )
 
         # Move meta tensor back to device before exporting.
         remove_hook_from_module(model, recurse=True)
 
-        dtype = None
-        if "w4a8_awq" in args.qformat:
-            # TensorRT-LLM w4a8 only support fp16 as the dtype.
-            dtype = torch.float16
-
-        # For Gemma2-27B, TRT-LLM only works with bfloat16 as the dtype.
-        if model_type == "gemma2":
-            dtype = torch.bfloat16
-
         export_tensorrt_llm_checkpoint(
             model,
             model_type,
-            dtype=dtype,
             export_dir=export_path,
             inference_tensor_parallel=args.inference_tensor_parallel,
             inference_pipeline_parallel=args.inference_pipeline_parallel,

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 7 additions & 65 deletions
@@ -87,11 +87,9 @@ fi
 
 QFORMAT_MODIFIED="${QFORMAT//,/_}"
 
-MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')
+MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
 
-MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_${EXPORT_FORMAT}
-
-SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_FULL_NAME}
+SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
 
 MODEL_CONFIG=${SAVE_PATH}/config.json
 
@@ -188,13 +186,13 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
 
         if [ "$cuda_major" -lt 10 ]; then
-            echo "Please build the tensorrt_llm engine on Blackwell GPU for deployment. Checkpoint export_path: $SAVE_PATH"
+            echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH"
             exit 0
         fi
     fi
 
     if [[ ! " fp8 nvfp4 bf16 fp16 int4_awq w4a8_awq " =~ " ${QFORMAT} " ]]; then
-        echo "Quant $QFORMAT not supported with the TensorRT-LLM torch llmapi. Allowed values are: fp8, nvfp4, bf16, fp16, int4_awq, w4a8_awq"
+        echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
         exit 0
     fi
 
@@ -315,15 +313,15 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     pushd ../llm_eval/
 
     if [[ $TASKS =~ "livecodebench" ]]; then
-        bash run_livecodebench.sh $MODEL_FULL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt
+        bash run_livecodebench.sh $MODEL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt
         mkdir -p $SAVE_PATH/livecodebench
-        mv LiveCodeBench/output/$MODEL_FULL_NAME/* $SAVE_PATH/livecodebench
+        mv LiveCodeBench/output/$MODEL_NAME/* $SAVE_PATH/livecodebench
         echo "LiveCodeBench results are saved under $SAVE_PATH/livecodebench."
 
     fi
 
     if [[ $TASKS =~ "simple_eval" ]]; then
-        bash run_simple_eval.sh $MODEL_FULL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt
+        bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt
         echo "Simple eval results are saved under $SAVE_PATH/simple_eval.txt."
     fi
 
@@ -332,61 +330,5 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     kill $SERVE_PID
 fi
 
-if [[ $TASKS =~ "benchmark" ]]; then
-
-    if [ "$PP" -ne 1 ]; then
-        echo "Benchmark does not work with multi PP. Please run the c++ benchmark in the TensorRT-LLM repo..."
-        exit 1
-    fi
-
-    BENCHMARK_RESULT=${SAVE_PATH}/benchmark.txt
-    echo "Evaluating performance, result saved to $BENCHMARK_RESULT..."
-
-    # Prepare datasets for TRT-LLM benchmark
-    if [ -z "$TRT_LLM_CODE_PATH" ]; then
-        TRT_LLM_CODE_PATH=/app/tensorrt_llm
-        echo "Setting default TRT_LLM_CODE_PATH to $TRT_LLM_CODE_PATH."
-    fi
-
-    # Synthesize the tokenized benchmarking dataset
-    TRT_LLM_PREPARE_DATASET=$TRT_LLM_CODE_PATH/benchmarks/cpp/prepare_dataset.py
-
-    # Align with the official benchmark
-    BENCHMARK_INPUT_LEN=$BUILD_MAX_INPUT_LEN
-    BENCHMARK_OUTPUT_LEN=$BUILD_MAX_OUTPUT_LEN
-    BENCHMARK_NUM_REQUESTS=256
-
-    DATASET_TXT=${SAVE_PATH}/synthetic_${BENCHMARK_INPUT_LEN}_${BENCHMARK_OUTPUT_LEN}_${BENCHMARK_NUM_REQUESTS}.txt
-
-    if [ -z "$TRT_LLM_PREPARE_DATASET" ]; then
-        echo "Unable to prepare dataset for benchmarking. Please set TRT_LLM_CODE_PATH to the TRT-LLM code path."
-    else
-        if ! [ -f $DATASET_TXT ]; then
-            python $TRT_LLM_PREPARE_DATASET --stdout --tokenizer $MODEL_PATH token-norm-dist \
-                --input-mean $BENCHMARK_INPUT_LEN --output-mean $BENCHMARK_OUTPUT_LEN --input-stdev 0 --output-stdev 0 \
-                --num-requests $BENCHMARK_NUM_REQUESTS >$DATASET_TXT
-        else
-            echo "Use existing benchmark dataset in $DATASET_TXT."
-        fi
-    fi
-
-    MODEL_ARGS="--model_path $SAVE_PATH "
-    EXTRA_ARGS="--backend pytorch "
-
-    if [ "$BUILD_MAX_BATCH_SIZE" -gt 1 ]; then
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS throughput $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
-    else
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS latency $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
-    fi
-
-fi
-
-if [ -n "$FREE_SPACE" ]; then
-    rm -f $SAVE_PATH/*.json
-    rm -f $SAVE_PATH/*.safetensors
-    rm -f $SAVE_PATH/*/*.json
-    rm -f $SAVE_PATH/*/*.engine
-    rm -f $SAVE_PATH/*/*.cache
-fi
 
 popd

examples/llm_ptq/scripts/parser.sh

Lines changed: 1 addition & 2 deletions
@@ -54,7 +54,6 @@ parse_options() {
         --lm_eval_tasks ) LM_EVAL_TASKS="$2"; shift 2;;
         --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;;
         --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;;
-        --num_samples ) NUM_SAMPLES="$2"; shift 2;;
         --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;;
         --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;;
         --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;;
@@ -96,7 +95,7 @@ parse_options() {
         exit 1
     fi
 
-    VALID_TASKS=("quant" "mmlu" "mtbench" "benchmark" "lm_eval" "livecodebench" "simple_eval")
+    VALID_TASKS=("quant" "mmlu" "mtbench" "lm_eval" "livecodebench" "simple_eval")
 
     for task in $(echo "$TASKS" | tr ',' ' '); do
         is_valid_task=false

examples/vlm_ptq/scripts/huggingface_example.sh

Lines changed: 4 additions & 4 deletions
@@ -35,10 +35,10 @@ if [ -z "$MODEL_PATH" ]; then
 fi
 
 case $QFORMAT in
-    fp8|int8_sq|int4_awq|w4a8_awq|fp16|bf16|nvfp4)
+    fp8|int8_sq|int4_awq|w4a8_awq|nvfp4)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, nvfp4]" >&2
         exit 1
 esac
 
@@ -50,8 +50,8 @@ if [ -z "$ROOT_SAVE_PATH" ]; then
     ROOT_SAVE_PATH=$(pwd)
 fi
 
-MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')
-SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}_${QFORMAT}
+MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
+SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
 
 MODEL_CONFIG=${SAVE_PATH}/config.json
 
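With `fp16`/`bf16` dropped from the accepted formats, a quantization-only VLM run would look roughly like the sketch below. The model id mirrors the Qwen2-VL test asset added in tests/_test_utils/model.py later in this commit, and the flags are illustrative; the updated test helper no longer passes a `--type` argument, so it is omitted here as well.

    scripts/huggingface_example.sh --model Qwen/Qwen2-VL-2B-Instruct --quant nvfp4 --tasks quant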

modelopt/deploy/llm/generate.py

Lines changed: 3 additions & 4 deletions
@@ -30,7 +30,6 @@
     from tensorrt_llm.llmapi import CudaGraphConfig
     from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
     from tensorrt_llm.llmapi.llm import LLM as TRTLLM
-    from tensorrt_llm.llmapi.tokenizer import TokenizerBase
 except ImportError:
     print("Please upgrade tensorrt-llm to 1.1.0rc2 or later")
     raise
@@ -57,7 +56,7 @@ class LLM(TRTLLM):
     def __init__(
         self,
         checkpoint_dir: str | Path,
-        tokenizer: "str | Path | TokenizerBase | None" = None,
+        tokenizer: "str | Path | None" = None,
         kv_cache_config: dict[str, int | float] = {},
         medusa_choices: Any = None,
         tp: int = 0,
@@ -67,7 +66,7 @@ def __init__(
         """Initializes the LLM runner class.
 
         Args:
-            engine_dir: the directory path of the TensorRT-LLM engine.
+            checkpoint_dir: the directory path of the model checkpoint.
             tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model.
             kv_cache_config: the kv cache config as a dict. Please refer to
                 https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/
@@ -112,7 +111,7 @@ def _find_max_position_embeddings(cfg: dict) -> int | None:
         # Check if any key in config contains both "num" and "experts"
         ep = 1
         enable_attention_dp = False
-        for k in config.keys():
+        for k in config:
             if "num" in k and "experts" in k:
                 ep = torch.cuda.device_count()
                 enable_attention_dp = True
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ disable_error_code = ["attr-defined"]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
 # print execution time for 20 slowest tests and generate coverage reports
-addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+# addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
 pythonpath = ["tests/"]
 markers = ["manual: Only run when --run-manual is given"]
 
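With `addopts` commented out, the summary and coverage flags are no longer applied automatically. If they are still wanted for a local run they can be passed on the command line, for example (a sketch assuming pytest and pytest-cov are installed; whether CI supplies these flags elsewhere is outside this diff):

    pytest -ra --cov-report=term-missing --cov-config=pyproject.toml --durations=20 --strict-markers tests/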

tests/_test_utils/examples/run_command.py

Lines changed: 4 additions & 4 deletions
@@ -117,16 +117,16 @@ def run_llm_export_command(
 
 def run_llm_ptq_command(*, model: str, quant: str, **kwargs):
     kwargs.update({"model": model, "quant": quant})
-    kwargs.setdefault("tasks", "build")
+    kwargs.setdefault("tasks", "quant")
     kwargs.setdefault("calib", 16)
 
     cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh", "--no-verbose"], **kwargs)
     run_example_command(cmd_parts, "llm_ptq")
 
 
-def run_vlm_ptq_command(*, model: str, type: str, quant: str, **kwargs):
-    kwargs.update({"model": model, "type": type, "quant": quant})
-    kwargs.setdefault("tasks", "build")
+def run_vlm_ptq_command(*, model: str, quant: str, **kwargs):
+    kwargs.update({"model": model, "quant": quant})
+    kwargs.setdefault("tasks", "quant")
     kwargs.setdefault("calib", 16)
 
     cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh"], **kwargs)

tests/_test_utils/model.py

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,11 @@ def _select_path(remote_id: str, local_id: str) -> str:
     local_id="llava-1.5-7b-hf",
 )
 
+QWEN_VL_PATH = _select_path(
+    remote_id="Qwen/Qwen2-VL-2B-Instruct",
+    local_id="Qwen2-VL-2B-Instruct",
+)
+
 # Diffusers
 FLUX_SCHNELL_PATH = _select_path(
     remote_id="hf-internal-testing/tiny-flux-pipe",
