4 changes: 4 additions & 0 deletions CHANGELOG.rst
@@ -6,6 +6,10 @@ Model Optimizer Changelog (Linux)

**Deprecations**

- TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``.
- ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
- ``examples/vlm_eval``, as it depends on the deprecated TRT backend of TRT-LLM.

**Bug Fixes**

**New Features**
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -1,4 +1,4 @@
FROM nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc6
FROM nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2

ARG PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com"
ENV PIP_EXTRA_INDEX_URL=$PIP_EXTRA_INDEX_URL \
6 changes: 3 additions & 3 deletions examples/llm_eval/README.md
@@ -93,7 +93,7 @@ If `trust_remote_code` needs to be true, please append the command with the `--trust_remote_code` flag.
### TensorRT-LLM

```sh
python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,engine_dir=<TRT LLM engine dir> --tasks <comma separated tasks> --batch_size <engine batch size>
python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,engine_dir=<Quantized checkpoint dir> --tasks <comma separated tasks> --batch_size <engine batch size>
```
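
For illustration, a filled-in version of the command might look like the following; the model name, checkpoint directory, tasks, and batch size are placeholder values chosen for this example, not ones prescribed by this change.

```sh
# Hypothetical invocation: evaluate a quantized checkpoint on two tasks.
python lm_eval_tensorrt_llm.py --model trt-llm \
    --model_args tokenizer=meta-llama/Llama-3.1-8B-Instruct,engine_dir=./llama-3.1-8b-fp8-hf-ckpt \
    --tasks gsm8k,mmlu \
    --batch_size 8
```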

## MMLU
@@ -140,7 +140,7 @@ python mmlu.py --model_name causal --model_path <HF model folder or model card>
### Evaluate the TensorRT-LLM engine

```bash
python mmlu.py --model_name causal --model_path <HF model folder or model card> --engine_dir <built TensorRT-LLM folder>
python mmlu.py --model_name causal --model_path <HF model folder or model card> --engine_dir <Quantized checkpoint dir>
```

## MT-Bench
@@ -163,7 +163,7 @@ bash run_fastchat.sh -h <HF model folder or model card> --quant_cfg MODELOPT_QUA
### Evaluate the TensorRT-LLM engine

```bash
bash run_fastchat.sh -h <HF model folder or model card> <built TensorRT-LLM folder>
bash run_fastchat.sh -h <HF model folder or model card> <Quantized checkpoint dir>
```

### Judging the responses
96 changes: 44 additions & 52 deletions examples/llm_ptq/hf_ptq.py
@@ -89,28 +89,20 @@ def auto_quantize(
qformat_list = qformat.split(",")
assert qformat_list, "No quantization formats provided"
# Check if all provided quantization formats are supported
if args.export_fmt == "hf":
assert all(
qformat
in [
"fp8",
"int4_awq",
"nvfp4",
"nvfp4_awq",
"w4a8_awq",
"fp8_pb_wo",
"w4a8_mxfp4_fp8",
"nvfp4_mlp_only",
]
for qformat in qformat_list
), (
"One or more quantization formats provided are not supported for unified checkpoint export"
)
else:
assert all(
qformat in ["fp8", "int8_sq", "int4_awq", "w4a8_awq", "nvfp4", "nvfp4_awq"]
for qformat in qformat_list
), "One or more quantization formats provided are not supported for tensorrt llm export"
assert all(
qformat
in [
"fp8",
"int4_awq",
"nvfp4",
"nvfp4_awq",
"w4a8_awq",
"fp8_pb_wo",
"w4a8_mxfp4_fp8",
"nvfp4_mlp_only",
]
for qformat in qformat_list
), "One or more quantization formats provided are not supported for unified checkpoint export"

Review comment:
do you think we can pull this list of supported qformats as a variable and reuse it in other places (in the auto quantize section)?

Collaborator Author:
ACK. This PR is pretty large. I hope we can move the improvements to a follow-up.

def loss_func(output, data):
# For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
@@ -219,27 +211,21 @@ def main(args):
"Quantization supports only one quantization format."
)

# Check arguments for unified_hf export format and set to default if unsupported arguments are provided
if args.export_fmt == "hf":
assert args.sparsity_fmt == "dense", (
f"Sparsity format {args.sparsity_fmt} not supported by unified export api."
)

if not args.auto_quantize_bits:
assert (
args.qformat
in [
"int4_awq",
"fp8",
"nvfp4",
"nvfp4_awq",
"w4a8_awq",
"fp8_pb_wo",
"w4a8_mxfp4_fp8",
"nvfp4_mlp_only",
]
or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES
), f"Quantization format {args.qformat} not supported for HF export path"
if not args.auto_quantize_bits:
assert (
args.qformat
in [
"int4_awq",
"fp8",
"nvfp4",
"nvfp4_awq",
"w4a8_awq",
"fp8_pb_wo",
"w4a8_mxfp4_fp8",
"nvfp4_mlp_only",
]
or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES
), f"Quantization format {args.qformat} not supported for HF export path"

Comment on lines +214 to 229
⚠️ Potential issue

Assertion is ineffective; `args.kv_cache_qformat in KV_QUANT_CFG_CHOICES` is always True.

This currently lets unsupported qformats through. Tighten the check.

-    if not args.auto_quantize_bits:
-        assert (
-            args.qformat
-            in [
-                "int4_awq",
-                "fp8",
-                "nvfp4",
-                "nvfp4_awq",
-                "w4a8_awq",
-                "fp8_pb_wo",
-                "w4a8_mxfp4_fp8",
-                "nvfp4_mlp_only",
-            ]
-            or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES
-        ), f"Quantization format {args.qformat} not supported for HF export path"
+    if not args.auto_quantize_bits:
+        assert args.qformat in ALLOWED_UNIFIED_HF_QFORMATS, (
+            f"Quantization format {args.qformat} not supported for HF export path"
+        )

Note: If you intended to allow “KV-cache-only” quant, handle that as a separate branch instead of weakening this assert.

Committable suggestion skipped: line range outside the PR's diff.
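
If KV-cache-only quantization is indeed meant to be allowed, the separate branch mentioned in the note could look roughly like the sketch below. Both `ALLOWED_UNIFIED_HF_QFORMATS` (the set proposed in the suggestion) and the `"none"` sentinel qformat are assumptions, not names defined in this PR.

```python
# Sketch only. Assumed names: ALLOWED_UNIFIED_HF_QFORMATS (the supported-format
# set from the suggestion above) and a hypothetical qformat value "none" meaning
# the weights stay unquantized and only the KV cache is quantized.
if not args.auto_quantize_bits:
    if args.qformat == "none":
        # KV-cache-only path: validate the KV cache format on its own.
        assert args.kv_cache_qformat in KV_QUANT_CFG_CHOICES, (
            f"KV cache quantization format {args.kv_cache_qformat} not supported for HF export path"
        )
    else:
        assert args.qformat in ALLOWED_UNIFIED_HF_QFORMATS, (
            f"Quantization format {args.qformat} not supported for HF export path"
        )
```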

# If low memory mode is enabled, we compress the model while loading the HF checkpoint.
calibration_only = False
@@ -253,9 +239,6 @@ def main(args):
attn_implementation=args.attn_implementation,
)
else:
assert args.export_fmt == "hf", (
"Low memory mode is only supported for exporting HF checkpoint."
)
assert args.qformat in QUANT_CFG_CHOICES, (
f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
)
@@ -600,7 +583,10 @@ def output_decode(generated_ids, input_shape):
setattr(model.config, "architectures", full_model_config.architectures)

start_time = time.time()
if args.export_fmt == "tensorrt_llm":
if model_type in ["t5", "bart", "whisper"] or args.sparsity_fmt != "dense":
# Still export TensorRT-LLM checkpoints for the models not supported by the
# TensorRT-LLM torch runtime.

# Move meta tensor back to device before exporting.
remove_hook_from_module(model, recurse=True)

Expand All @@ -621,13 +607,16 @@ def output_decode(generated_ids, input_shape):
inference_tensor_parallel=args.inference_tensor_parallel,
inference_pipeline_parallel=args.inference_pipeline_parallel,
)
elif args.export_fmt == "hf":
else:
# Check arguments for unified_hf export format and set to default if unsupported arguments are provided
assert args.sparsity_fmt == "dense", (
f"Sparsity format {args.sparsity_fmt} not supported by unified export api."
)

export_hf_checkpoint(
full_model,
export_dir=export_path,
)
else:
raise NotImplementedError(f"{args.export_fmt} not supported")

# Restore default padding and export the tokenizer as well.
if tokenizer is not None:
@@ -710,9 +699,9 @@ def output_decode(generated_ids, input_shape):
parser.add_argument(
"--export_fmt",
required=False,
default="tensorrt_llm",
default="hf",
choices=["tensorrt_llm", "hf"],
help=("Checkpoint export format"),
help="Deprecated. Please avoid using this argument.",
)
parser.add_argument(
"--trust_remote_code",
@@ -767,6 +756,9 @@ def output_decode(generated_ids, input_shape):

args = parser.parse_args()

if args.export_fmt != "hf":
warnings.warn("Deprecated. --export_fmt will be ignored.")

args.dataset = args.dataset.split(",") if args.dataset else None
args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
main(args)