
Commit 29fa81c

Update
1 parent 92e6900 commit 29fa81c

8 files changed, +44 -42 lines changed

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 11 additions & 12 deletions
@@ -89,7 +89,7 @@ QFORMAT_MODIFIED="${QFORMAT//,/_}"
 
 MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')
 
-MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_${EXPORT_FORMAT}
+MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
 
 SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_FULL_NAME}
 
@@ -188,13 +188,13 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
     cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
 
     if [ "$cuda_major" -lt 10 ]; then
-        echo "Please build the tensorrt_llm engine on Blackwell GPU for deployment. Checkpoint export_path: $SAVE_PATH"
+        echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH"
         exit 0
     fi
 fi
 
 if [[ ! " fp8 nvfp4 bf16 fp16 int4_awq w4a8_awq " =~ " ${QFORMAT} " ]]; then
-    echo "Quant $QFORMAT not supported with the TensorRT-LLM torch llmapi. Allowed values are: fp8, nvfp4, bf16, fp16, int4_awq, w4a8_awq"
+    echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
     exit 0
 fi
 
@@ -333,12 +333,6 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
 fi
 
 if [[ $TASKS =~ "benchmark" ]]; then
-
-    if [ "$PP" -ne 1 ]; then
-        echo "Benchmark does not work with multi PP. Please run the c++ benchmark in the TensorRT-LLM repo..."
-        exit 1
-    fi
-
     BENCHMARK_RESULT=${SAVE_PATH}/benchmark.txt
     echo "Evaluating performance, result saved to $BENCHMARK_RESULT..."
 
@@ -371,12 +365,17 @@ if [[ $TASKS =~ "benchmark" ]]; then
     fi
 
     MODEL_ARGS="--model_path $SAVE_PATH "
-    EXTRA_ARGS="--backend pytorch "
+    if [ -n "$CUDA_VISIBLE_DEVICES" ]; then
+        VISIBLE_GPU_COUNT=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | grep -v '^$' | wc -l)
+    else
+        VISIBLE_GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+    fi
+    EXTRA_ARGS="--backend pytorch --tp $VISIBLE_GPU_COUNT "
 
     if [ "$BUILD_MAX_BATCH_SIZE" -gt 1 ]; then
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS throughput $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
+        trtllm-bench throughput --model $MODEL_PATH $MODEL_ARGS $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
     else
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS latency $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
+        trtllm-bench latency --model $MODEL_PATH $MODEL_ARGS $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
     fi
 
 fi
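For reference, the device count that now feeds --tp can be mirrored in Python as a quick sanity check. This is an illustrative sketch only (the shipped logic is the bash above); the function name is hypothetical.

    import os
    import subprocess

    def visible_gpu_count() -> int:
        # Hypothetical mirror of the bash fallback above: prefer CUDA_VISIBLE_DEVICES,
        # otherwise count the GPUs reported by nvidia-smi.
        devices = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
        if devices:
            return len([d for d in devices.split(",") if d])
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
            capture_output=True, text=True, check=True,
        )
        return len([line for line in out.stdout.splitlines() if line.strip()])

The resulting count is what EXTRA_ARGS passes to trtllm-bench as --tp, so masking GPUs with CUDA_VISIBLE_DEVICES also shrinks the benchmark's tensor-parallel size.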

examples/llm_ptq/scripts/parser.sh

Lines changed: 0 additions & 1 deletion
@@ -54,7 +54,6 @@ parse_options() {
         --lm_eval_tasks ) LM_EVAL_TASKS="$2"; shift 2;;
         --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;;
         --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;;
-        --num_samples ) NUM_SAMPLES="$2"; shift 2;;
         --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;;
         --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;;
         --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;;

modelopt/deploy/llm/generate.py

Lines changed: 3 additions & 4 deletions
@@ -30,7 +30,6 @@
     from tensorrt_llm.llmapi import CudaGraphConfig
     from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
     from tensorrt_llm.llmapi.llm import LLM as TRTLLM
-    from tensorrt_llm.llmapi.tokenizer import TokenizerBase
 except ImportError:
     print("Please upgrade tensorrt-llm to 1.1.0rc2 or later")
     raise
@@ -57,7 +56,7 @@ class LLM(TRTLLM):
     def __init__(
         self,
         checkpoint_dir: str | Path,
-        tokenizer: "str | Path | TokenizerBase | None" = None,
+        tokenizer: "str | Path | None" = None,
         kv_cache_config: dict[str, int | float] = {},
         medusa_choices: Any = None,
         tp: int = 0,
@@ -67,7 +66,7 @@ def __init__(
         """Initializes the LLM runner class.
 
         Args:
-            engine_dir: the directory path of the TensorRT-LLM engine.
+            checkpoint_dir: the directory path of the model checkpoint.
            tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model.
            kv_cache_config: the kv cache config as a dict. Please refer to
                https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/
@@ -112,7 +111,7 @@ def _find_max_position_embeddings(cfg: dict) -> int | None:
        # Check if any key in config contains both "num" and "experts"
        ep = 1
        enable_attention_dp = False
-        for k in config.keys():
+        for k in config:
            if "num" in k and "experts" in k:
                ep = torch.cuda.device_count()
                enable_attention_dp = True
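A minimal construction sketch against the updated signature, assuming the class is importable from modelopt.deploy.llm.generate (inferred from the file path); the checkpoint directory and tp value are placeholders.

    from modelopt.deploy.llm.generate import LLM  # module path inferred from the file location

    llm = LLM(
        checkpoint_dir="/path/to/saved_models_llama_nvfp4",  # placeholder checkpoint export path
        tokenizer=None,       # now "str | Path | None"; a TokenizerBase instance is no longer part of the annotation
        kv_cache_config={},   # dict[str, int | float]; see the TensorRT-LLM tuning guide linked in the docstring
        tp=2,                 # placeholder tensor-parallel size
    )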

tests/_test_utils/model.py

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,11 @@ def _select_path(remote_id: str, local_id: str) -> str:
     local_id="llava-1.5-7b-hf",
 )
 
+LLAMA3_2_PATH = _select_path(
+    remote_id="meta-llama/Llama-3.2-1B-Instruct",
+    local_id="Llama-3.2-1B-Instruct",
+)
+
 # Diffusers
 FLUX_SCHNELL_PATH = _select_path(
     remote_id="hf-internal-testing/tiny-flux-pipe",

tests/_test_utils/ptq_utils.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class PTQCommand:
     pp: int | None = None
     min_sm: int | None = None
     min_gpu: int | None = None
+    batch: int | None = None
 
     def run(self, model_path: str):
         if self.min_sm and torch.cuda.get_device_capability() < (
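To show how the new field is meant to be used (mirroring the benchmark entries added in test_llm_ptq.py below), here is a short sketch; the import path is inferred from tests/_test_utils/ptq_utils.py and the model path is a placeholder.

    from _test_utils.ptq_utils import PTQCommand  # import path inferred from the file location

    # Benchmark task on 2+ GPUs, pinned to batch size 1 via the new `batch` field.
    cmd = PTQCommand(quant="fp8", tasks="benchmark", min_gpu=2, batch=1)
    cmd.run("/path/to/tiny-llama")  # placeholder model path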

tests/examples/llm_eval/test_llm_eval.py

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ def test_llama_eval_fp8(tiny_llama_path):
             lm_eval_tasks="hellaswag,gsm8k",
             simple_eval_tasks="humaneval",
             lm_eval_limit=0.1,
+            batch=8,
         )
     finally:
         # Force kill llm-serve if it's still running

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 21 additions & 23 deletions
@@ -92,51 +92,49 @@ def llama_path(tiny_llama_path):
     [
         PTQCommand(quant="fp16"),
         PTQCommand(quant="bf16"),
-        PTQCommand(quant="int8_sq"),
-        # ("int8_sq", "tensorrt_llm", "sparsegpt"),
-        PTQCommand(quant="int4_awq"),
+        PTQCommand(quant="int8_sq", kv_cache_quant="none"),
+        PTQCommand(quant="int4_awq", kv_cache_quant="none"),
         PTQCommand(quant="nvfp4"),
         PTQCommand(quant="nvfp4_awq"),
-        #
         # autoquant
         PTQCommand(
             quant="int4_awq,nvfp4,fp8,w4a8_awq",
             calib_batch_size=4,
             auto_quantize_bits=6.4,
+            kv_cache_quant="none",
         ),
-        #
         # kv_cache
         PTQCommand(quant="nvfp4_awq", kv_cache_quant="nvfp4"),
-        # ("nvfp4_awq", "tensorrt_llm", "nvfp4_affine"),
-        # ("nvfp4_awq", "hf", "nvfp4_affine"),
         #
         # autoquant_kv_cache
         PTQCommand(
-            quant="int4_awq,nvfp4,fp8,w4a8_awq",
+            quant="nvfp4,fp8",
+            kv_cache_quant="fp8",
+            calib_batch_size=4,
+            auto_quantize_bits=6.4,
+        ),
+        PTQCommand(
+            quant="nvfp4,fp8",
             kv_cache_quant="nvfp4",
             calib_batch_size=4,
             auto_quantize_bits=6.4,
         ),
-        # ("int4_awq,nvfp4,fp8,w4a8_awq", "tensorrt_llm", "nvfp4_affine"),
-        # ("int4_awq,nvfp4,fp8,w4a8_awq", "hf", "nvfp4_affine"),
-        #
         # sm89
         PTQCommand(quant="fp8", min_sm=89),
         PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89),
-        # ("fp8", "tensorrt_llm", "sparsegpt", None),
-        PTQCommand(quant="w4a8_awq", min_sm=89),
+        PTQCommand(quant="w4a8_awq", kv_cache_quant="none", min_sm=89),
+        # sm100
+        PTQCommand(quant="nvfp4", min_sm=100),
         #
         # multi_gpu
-        # TP
-        PTQCommand(quant="fp16", tp=2, pp=1, min_gpu=2),
-        # ("fp16", "build", "sparsegpt", 1),
-        PTQCommand(quant="nvfp4", tp=2, pp=1, min_gpu=2),
-        PTQCommand(quant="fp16", tasks="benchmark", tp=2, pp=1, min_gpu=2),
-        # ("fp16", "benchmark", "sparsegpt", 2, 1),
-        # PP
-        # ("nvfp4", "build", None, 1, 2),
-        # ("fp16", "build", None, 1, 2),
-        # ("fp16", "build", "sparsegpt", 1, 2),
+        PTQCommand(quant="fp16", min_gpu=2),
+        PTQCommand(quant="nvfp4", min_gpu=2),
+        PTQCommand(quant="fp16", tasks="benchmark", min_gpu=2),
+        PTQCommand(quant="fp8", tasks="benchmark", min_gpu=2),
+        PTQCommand(quant="nvfp4", tasks="benchmark", min_gpu=2),
+        PTQCommand(quant="fp16", tasks="benchmark", min_gpu=2, batch=1),
+        PTQCommand(quant="fp8", tasks="benchmark", min_gpu=2, batch=1),
+        PTQCommand(quant="nvfp4", tasks="benchmark", min_gpu=2, batch=1),
     ],
     ids=PTQCommand.param_str,
 )

tests/examples/vlm_ptq/test_llava.py

Lines changed: 2 additions & 2 deletions
@@ -16,11 +16,11 @@
 
 import pytest
 from _test_utils.examples.run_command import run_vlm_ptq_command
-from _test_utils.model import LLAVA_PATH
+from _test_utils.model import LLAMA3_2_PATH
 from _test_utils.torch_misc import minimum_gpu
 
 
 @pytest.mark.parametrize("quant", ["fp16"])
 @minimum_gpu(2)
 def test_llava_multi_gpu(quant):
-    run_vlm_ptq_command(model=LLAVA_PATH, type="llava", quant=quant, tp=2)
+    run_vlm_ptq_command(model=LLAMA3_2_PATH, quant=quant, tp=2)
