2 changes: 1 addition & 1 deletion CHANGELOG.rst
@@ -8,7 +8,7 @@ Model Optimizer Changelog (Linux)

- Deprecated ModelOpt's custom docker images. Please use the PyTorch, TensorRT-LLM or TensorRT docker image directly or refer to the `installation guide <https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html>`_ for more details.
- Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. ``engine_dir`` is replaced with ``checkpoint_dir`` in ``examples/llm_ptq`` and ``examples/vlm_ptq``. For performance evaluation, please use ``trtllm-bench`` directly.
- ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
- Deprecated ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
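
For anyone replacing the removed `benchmark` task, a hypothetical `trtllm-bench` run is sketched below. The model name, dataset path, and flags are placeholders, and the exact subcommands and options vary by TensorRT-LLM version, so check `trtllm-bench --help` before relying on this.

```sh
# Assumed usage sketch (not part of this PR): throughput benchmarking of a
# Hugging Face model against a pre-generated prompt dataset.
trtllm-bench --model meta-llama/Meta-Llama-3-8B-Instruct \
    throughput --dataset /tmp/synthetic_dataset.jsonl
```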

8 changes: 4 additions & 4 deletions examples/llm_eval/README.md
@@ -93,7 +93,7 @@ If `trust_remote_code` needs to be true, please append the command with the `--t
### TensorRT-LLM

```sh
python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,engine_dir=<Quantized checkpoint dir> --tasks <comma separated tasks> --batch_size <engine batch size>
python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,checkpoint_dir=<Quantized checkpoint dir> --tasks <comma separated tasks> --batch_size <max batch size>
```

## MMLU
@@ -137,10 +137,10 @@ python mmlu.py --model_name causal --model_path <HF model folder or model card>
python mmlu.py --model_name causal --model_path <HF model folder or model card> --quant_cfg $MODELOPT_QUANT_CFG_TO_SEARCH --auto_quantize_bits $EFFECTIVE_BITS --batch_size 4
```

### Evaluate the TensorRT-LLM engine
### Evaluate with TensorRT-LLM

```bash
python mmlu.py --model_name causal --model_path <HF model folder or model card> --engine_dir <Quantized checkpoint dir>
python mmlu.py --model_name causal --model_path <HF model folder or model card> --checkpoint_dir <Quantized checkpoint dir>
```

## MT-Bench
@@ -160,7 +160,7 @@ bash run_fastchat.sh -h <HF model folder or model card>
bash run_fastchat.sh -h <HF model folder or model card> --quant_cfg MODELOPT_QUANT_CFG
```

### Evaluate the TensorRT-LLM engine
### Evaluate with TensorRT-LLM

```bash
bash run_fastchat.sh -h <HF model folder or model card> <Quantized checkpoint dir>
42 changes: 19 additions & 23 deletions examples/llm_eval/gen_model_answer.py
@@ -118,7 +118,7 @@ def run_eval(
max_gpu_memory,
dtype,
revision,
engine_dir,
checkpoint_dir,
nim_model,
args,
):
Comment on lines +121 to 124

🛠️ Refactor suggestion | 🟠 Major

🧩 Analysis chain

Rename to checkpoint_dir in run_eval: looks good

Parameter rename is consistent.

Run to confirm no stale uses remain:


🏁 Script executed:

#!/bin/bash
# Find lingering engine_dir references (flags, kwargs, vars)
rg -n -C2 -g '!**/venv/**' -g '!**/build/**' -g '!**/dist/**' -e '\bengine_dir\b' -e '--engine-dir'

Length of output: 148


🏁 Script executed:

#!/bin/bash
# Re-run search including all files and excluding venv, build, dist
rg -n -C2 -g '*' -g '!venv/**' -g '!build/**' -g '!dist/**' -e '\bengine_dir\b' -e '--engine-dir'

Length of output: 5278


Stale engine_dir references in Windows accuracy benchmark examples

- examples/windows/accuracy_benchmark/README.md (lines 165, 177): replace --engine_dir with --checkpoint-dir
- examples/windows/accuracy_benchmark/mmlu_benchmark.py (line 523): rename CLI arg and variable from engine_dir to checkpoint_dir
- examples/windows/accuracy_benchmark/trtllm_utils.py (around line 317): update parser.add_argument("--engine_dir",…) and all helper functions to use checkpoint_dir instead
🤖 Prompt for AI Agents
In examples/llm_eval/gen_model_answer.py around lines 121-124, ensure any
remaining references to the old CLI/variable name engine_dir are replaced with
checkpoint_dir; across the Windows accuracy benchmark examples update the README
(lines 165,177) to use --checkpoint-dir, rename CLI args and variables in
examples/windows/accuracy_benchmark/mmlu_benchmark.py (line ~523) from
engine_dir to checkpoint_dir, and in
examples/windows/accuracy_benchmark/trtllm_utils.py update
parser.add_argument("--engine_dir", ...) to --checkpoint-dir and rename all
helper function parameters and internal variables from engine_dir to
checkpoint_dir so names and docs are consistent.
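
One possible mechanical cleanup for the leftovers listed above, sketched under the assumption that GNU sed is available; review every match (for example with the rg command from the analysis chain) before committing, since only the flag spelling takes a hyphen while Python identifiers keep the underscore.

```sh
# Assumed cleanup sketch (not part of this PR). Run from the repo root and
# inspect `git diff` afterwards.
cd examples/windows/accuracy_benchmark
# 1) Flag spelling: --engine_dir -> --checkpoint-dir in the docs and parsers.
sed -i 's/--engine_dir/--checkpoint-dir/g' README.md mmlu_benchmark.py trtllm_utils.py
# 2) Remaining Python identifiers: engine_dir -> checkpoint_dir.
sed -i 's/\bengine_dir\b/checkpoint_dir/g' mmlu_benchmark.py trtllm_utils.py
```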

@@ -150,7 +150,7 @@ def run_eval(
revision=revision,
top_p=top_p,
temperature=temperature,
engine_dir=engine_dir,
checkpoint_dir=checkpoint_dir,
nim_model=nim_model,
)
Comment on lines +153 to 155

🛠️ Refactor suggestion | 🟠 Major

Pass explicit flags to remote worker; avoid relying on globals inside Ray

When use_ray is true, get_model_answers runs out-of-process. Pass required options explicitly to avoid NameError on globals.

Apply:

             checkpoint_dir=checkpoint_dir,
+            trust_remote_code=args.trust_remote_code,
+            quant_cfg=args.quant_cfg,
+            calib_batch_size=args.calib_batch_size,
+            calib_size=args.calib_size,
+            auto_quantize_bits=args.auto_quantize_bits,
             nim_model=nim_model,
📝 Committable suggestion


Suggested change
checkpoint_dir=checkpoint_dir,
nim_model=nim_model,
)
checkpoint_dir=checkpoint_dir,
trust_remote_code=args.trust_remote_code,
quant_cfg=args.quant_cfg,
calib_batch_size=args.calib_batch_size,
calib_size=args.calib_size,
auto_quantize_bits=args.auto_quantize_bits,
nim_model=nim_model,
)
🤖 Prompt for AI Agents
In examples/llm_eval/gen_model_answer.py around lines 153-155, the call that
runs get_model_answers out-of-process with use_ray=True relies on module globals
and will raise NameError inside the remote worker; update the remote invocation
to pass all required options explicitly (e.g., flags/variables previously
referenced as globals such as
model/config/checkpoint_dir/nim_model/tokenizer/device/seed/other runtime
options) and update get_model_answers' remote-compatible signature to accept
those parameters so the worker receives everything it needs; ensure only
serializable types are passed and remove reliance on globals inside the function
body.

for i in range(0, len(questions), chunk_size)
@@ -174,25 +174,22 @@ def get_model_answers(
revision,
top_p=None,
temperature=None,
engine_dir=None,
checkpoint_dir=None,
nim_model=None,
):
# Model Optimizer modification
if engine_dir:
tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
if engine_dir:
# get model type
last_part = os.path.basename(engine_dir)
model_type = last_part.split("_")[0]
# Some models require to set pad_token and eos_token based on external config (e.g., qwen)
if model_type == "qwen":
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)

assert LLM is not None, "tensorrt_llm APIs could not be imported."
model = LLM(engine_dir, tokenizer=tokenizer)
else:
raise ValueError("engine_dir is required for TensorRT LLM inference.")
tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
if checkpoint_dir:
# get model type
last_part = os.path.basename(checkpoint_dir)
model_type = last_part.split("_")[0]
# Some models require to set pad_token and eos_token based on external config (e.g., qwen)
if model_type == "qwen":
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)

assert LLM is not None, "tensorrt_llm APIs could not be imported."
model = LLM(checkpoint_dir, tokenizer=tokenizer)
elif not nim_model:
model, _ = load_model(
model_path,
Expand All @@ -205,7 +202,6 @@ def get_model_answers(
cpu_offloading=False,
debug=False,
)
tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
if args.quant_cfg:
quantize_model(
model,
@@ -259,7 +255,7 @@ def get_model_answers(

# some models may error out when generating long outputs
try:
if not engine_dir:
if not checkpoint_dir:
output_ids = model.generate(
torch.as_tensor(input_ids).cuda(),
do_sample=do_sample,
@@ -427,9 +423,9 @@ def reorg_answer_file(answer_file):
help="The model revision to load.",
)
parser.add_argument(
"--engine-dir",
"--checkpoint-dir",
type=str,
help="The path to the TensorRT LLM engine directory.",
help="The path to the model checkpoint directory.",
)
parser.add_argument(
"--nim-model",
@@ -502,7 +498,7 @@ def reorg_answer_file(answer_file):
max_gpu_memory=args.max_gpu_memory,
dtype=str_to_torch_dtype(args.dtype),
revision=args.revision,
engine_dir=args.engine_dir,
checkpoint_dir=args.checkpoint_dir,
nim_model=args.nim_model,
args=args,
)
8 changes: 4 additions & 4 deletions examples/llm_eval/lm_eval_tensorrt_llm.py
@@ -42,7 +42,7 @@ class TRTLLM(TemplateAPI):
def __init__(
self,
tokenizer: str,
engine_dir: str,
checkpoint_dir: str,
batch_size: int = 1,
**kwargs,
):
@@ -56,11 +56,11 @@ def __init__(
if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

assert isinstance(engine_dir, str)
assert isinstance(checkpoint_dir, str)

self.llm = LLM(checkpoint_dir=engine_dir, tokenizer=self.tokenizer)
self.llm = LLM(checkpoint_dir=checkpoint_dir, tokenizer=self.tokenizer)
self.max_length = self.llm.max_seq_len - 1
logger.info("Loaded TRT-LLM engine")
logger.info("Loaded TRT-LLM")

def model_call(
self,
8 changes: 5 additions & 3 deletions examples/llm_eval/mmlu.py
@@ -252,9 +252,9 @@ def main(
mto.enable_huggingface_checkpointing()
model_path = kwargs["model_path"]
tokenizer = get_tokenizer(model_path, trust_remote_code=kwargs.get("trust_remote_code", False))
if kwargs.get("engine_dir"):
if kwargs.get("checkpoint_dir"):
# get model type
last_part = os.path.basename(kwargs["engine_dir"])
last_part = os.path.basename(kwargs["checkpoint_dir"])
model_type = last_part.split("_")[0]
# Some models require to set pad_token and eos_token based on external config (e.g., qwen)
if model_type == "qwen":
@@ -264,7 +264,9 @@
assert LLM is not None, "tensorrt_llm APIs could not be imported."
medusa_choices = kwargs.get("medusa_choices")
model = LLM(
checkpoint_dir=kwargs["engine_dir"], tokenizer=tokenizer, medusa_choices=medusa_choices
checkpoint_dir=kwargs["checkpoint_dir"],
tokenizer=tokenizer,
medusa_choices=medusa_choices,
)
else:
model = select_model(
20 changes: 10 additions & 10 deletions examples/llm_eval/run_fastchat.sh
@@ -20,18 +20,18 @@
# If you are using NIM, ensure that you export the NIM API key using:
# export OPENAI_API_KEY=<NIM_API_KEY>
#
# Usage: bash run_fastchat.sh -h <HF model folder or model card> -e <engine_dir> -n <NIM model model card>
# Usage: bash run_fastchat.sh -h <HF model folder or model card> -e <checkpoint_dir> -n <NIM model model card>
# model_name: The HuggingFace handle or folder of the model to evaluate.
# engine_dir: The directory where the TRT-LLM engine is stored.
# checkpoint_dir: The directory where the checkpoint is stored.
# nim_model_name: The handle of the NIM model to be used for evaluation.
#
# Example commands:
#
# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model:
# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct
#
# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with TRT-LLM engine:
# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -e /path/to/engine_dir
# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with TRT-LLM:
# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -e /path/to/checkpoint_dir
#
# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with NIM:
# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -n meta-llama/Meta-Llama-3-8B-Instruct
@@ -41,7 +41,7 @@ set -e
set -x

hf_model_name=""
engine_dir=""
checkpoint_dir=""
nim_model_name=""
answer_file=""
quant_cfg=""
@@ -56,9 +56,9 @@ while [[ "$1" != "" ]]; do
shift
hf_model_name=$1
;;
-e | --engine_dir )
-e | --checkpoint_dir )
shift
engine_dir=$1
checkpoint_dir=$1
;;
-n | --nim_model_name )
shift
@@ -96,8 +96,8 @@ if [ "$hf_model_name" == "" ]; then
exit 1
fi

if [ "$engine_dir" != "" ]; then
engine_dir=" --engine-dir $engine_dir "
if [ "$checkpoint_dir" != "" ]; then
checkpoint_dir=" --checkpoint-dir $checkpoint_dir "
fi

if [ "$nim_model_name" != "" ]; then
@@ -143,7 +143,7 @@ PYTHONPATH=FastChat:$PYTHONPATH python gen_model_answer.py \
--model-id $hf_model_name \
--temperature 0.0001 \
--top-p 0.0001 \
$engine_dir \
$checkpoint_dir \
$nim_model_name \
$answer_file \
$quant_args
10 changes: 0 additions & 10 deletions examples/llm_ptq/example_utils.py
@@ -36,16 +36,6 @@ def is_speculative(hf_config):
)


def get_mode_type_from_engine_dir(engine_dir_str):
# Split the path by '/' and get the last part
last_part = os.path.basename(engine_dir_str)

# Split the last part by '_' and get the first segment
model_type = last_part.split("_")[0]

return model_type


def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs):
print(f"Initializing tokenizer from {ckpt_path}")

10 changes: 5 additions & 5 deletions examples/llm_ptq/run_tensorrt_llm.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""An example script to run the tensorrt_llm engine."""
"""An example script to run the tensorrt_llm inference."""

import argparse

@@ -28,7 +28,7 @@ def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer", type=str, default="")
parser.add_argument("--max_output_len", type=int, default=100)
parser.add_argument("--engine_dir", type=str, default="/tmp/modelopt")
parser.add_argument("--checkpoint_dir", type=str)
parser.add_argument(
"--input_texts",
type=str,
@@ -49,8 +49,8 @@

def run(args):
if not args.tokenizer:
# Assume the tokenizer files are saved in the engine_dr.
args.tokenizer = args.engine_dir
# Assume the tokenizer files are saved in the checkpoint_dir.
args.tokenizer = args.checkpoint_dir

if isinstance(args.tokenizer, PreTrainedTokenizerBase):
tokenizer = args.tokenizer
Expand All @@ -66,7 +66,7 @@ def run(args):

print("TensorRT-LLM example outputs:")

llm = LLM(args.engine_dir, tokenizer=tokenizer, max_batch_size=len(input_texts))
llm = LLM(args.checkpoint_dir, tokenizer=tokenizer, max_batch_size=len(input_texts))
torch.cuda.cudart().cudaProfilerStart()
outputs = llm.generate_text(input_texts, args.max_output_len)
torch.cuda.cudart().cudaProfilerStop()
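
For reference, a hypothetical invocation of the updated script; the checkpoint path and prompt are placeholders, and the tokenizer falls back to the checkpoint directory as the diff above shows.

```sh
# Placeholder path and prompt; the flags match the parser defined above.
python run_tensorrt_llm.py \
    --checkpoint_dir /path/to/quantized_checkpoint \
    --input_texts "How does the quantized model respond to this prompt?" \
    --max_output_len 100
```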
8 changes: 4 additions & 4 deletions examples/llm_ptq/scripts/huggingface_example.sh
@@ -158,7 +158,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage"
fi

# for enc-dec model, users need to refer TRT-LLM example to build engines and deployment
# for enc-dec model, users need to refer TRT-LLM example for deployment
if [[ -f "$SAVE_PATH/encoder/config.json" && -f "$SAVE_PATH/decoder/config.json" && ! -f $MODEL_CONFIG ]]; then
echo "Please continue to deployment with the TRT-LLM enc_dec example, https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/enc_dec. Checkpoint export_path: $SAVE_PATH"
exit 0
@@ -187,7 +187,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
RUN_ARGS+=" --trust_remote_code "
fi

python run_tensorrt_llm.py --engine_dir=$SAVE_PATH $RUN_ARGS
python run_tensorrt_llm.py --checkpoint_dir=$SAVE_PATH $RUN_ARGS
fi

if [[ -d "${MODEL_PATH}" ]]; then
@@ -229,7 +229,7 @@ if [[ $TASKS =~ "lm_eval" ]]; then

python lm_eval_tensorrt_llm.py \
--model trt-llm \
--model_args tokenizer=$MODEL_PATH,engine_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \
--model_args tokenizer=$MODEL_PATH,checkpoint_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \
--tasks $LM_EVAL_TASKS \
--batch_size $BUILD_MAX_BATCH_SIZE $lm_eval_flags | tee $LM_EVAL_RESULT

@@ -259,7 +259,7 @@ if [[ $TASKS =~ "mmlu" ]]; then
python mmlu.py \
--model_name causal \
--model_path $MODEL_ABS_PATH \
--engine_dir $SAVE_PATH \
--checkpoint_dir $SAVE_PATH \
--data_dir $MMLU_DATA_PATH | tee $MMLU_RESULT
popd

2 changes: 1 addition & 1 deletion examples/vlm_ptq/README.md
@@ -56,7 +56,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-

Please refer to the [llm_ptq/README.md](../llm_ptq/README.md) about the details of model quantization.

The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA, Phi-3-vision and Qwen2.5-VL models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.
The following scripts provide an all-in-one and step-by-step model quantization example for the supported Hugging Face multi-modal models. The quantization format and the number of GPUs will be supplied as inputs to these scripts.
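
To make this concrete, a hypothetical invocation of the Hugging Face example script is sketched below; the flag names and values are assumptions patterned on the `llm_ptq` example and should be checked against the script's own usage message.

```sh
# Assumed flags and model card (verify with the header of scripts/huggingface_example.sh):
scripts/huggingface_example.sh --model Qwen/Qwen2.5-VL-7B-Instruct --quant fp8
```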

### Hugging Face Example [Script](./scripts/huggingface_example.sh)
