
Commit 17e07d0 (1 parent: 5adb9ba)

[NVBUG: 5619158] Enforce high precision model dtype for diffusion trt

Signed-off-by: ajrasane <[email protected]>

3 files changed, +12 -14 lines changed

CHANGELOG.rst
Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@ Model Optimizer Changelog (Linux)
 - Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
 - Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
 - Add flags ``nodes_to_include`` and ``op_types_to_include`` in AutoCast to force-include nodes in low precision, even if they would otherwise be excluded by other rules.
+- Add support for ``torch.compile`` and benchmarking in ``examples/diffusers/quantization/diffusion_trt.py``.

 **Documentation**
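The compile-and-benchmark feature referenced in the new changelog entry follows a standard diffusers pattern. Below is a minimal sketch, assuming an illustrative sdxl-turbo checkpoint and CUDA-event timing; it does not reproduce the repository's own `benchmark_model()` helper:

```python
import torch
from diffusers import DiffusionPipeline

# Illustrative checkpoint; diffusion_trt.py resolves the model from its
# MODEL_ID mapping instead.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16
).to("cuda")

# Compile only the denoising backbone, mirroring the script's use of torch.compile.
pipe.unet = torch.compile(pipe.unet, mode="max-autotune")

prompt = "A cat holding a sign that says hello world"

# Warm-up run so one-time compilation cost is excluded from the measurement.
pipe(prompt, num_inference_steps=4, guidance_scale=0.0)

# Time end-to-end generation with CUDA events.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
pipe(prompt, num_inference_steps=4, guidance_scale=0.0)
end.record()
torch.cuda.synchronize()
print(f"Latency: {start.elapsed_time(end):.1f} ms")
```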

examples/diffusers/README.md
Lines changed: 7 additions & 2 deletions

@@ -307,11 +307,16 @@ Generate images for the quantized checkpoint with the following [Script](./quant
 python diffusion_trt.py \
     --model {sdxl-1.0|sdxl-turbo|sd3-medium|flux-dev} \
     --prompt "A cat holding a sign that says hello world" \
+    [--override-model-path /path/to/model] \
     [--restore-from ./{MODEL}_fp8.pt] \
     [--onnx-load-path {ONNX_DIR}] \
     [--trt-engine-load-path {ENGINE_DIR}] \
-    [--dq_only] \
-    [--torch]
+    [--dq-only] \
+    [--torch] \
+    [--save-image-as /path/to/image] \
+    [--benchmark] \
+    [--torch-compile] \
+    [--skip-image]
 ```

 This script will save the output image as `./{MODEL}.png` and report the latency of the TensorRT backbone.
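For example, a quantized FLUX checkpoint could be benchmarked through the PyTorch backend with the newly added flags (the checkpoint path here is illustrative):

```sh
python diffusion_trt.py \
    --model flux-dev \
    --restore-from ./flux-dev_fp8.pt \
    --torch \
    --torch-compile \
    --benchmark \
    --skip-image
```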

examples/diffusers/quantization/diffusion_trt.py
Lines changed: 4 additions & 12 deletions

@@ -126,13 +126,6 @@ def main():
         default=None,
         help="Path to the model if not using default paths in MODEL_ID mapping.",
     )
-    parser.add_argument(
-        "--model-dtype",
-        type=str,
-        default="Half",
-        choices=["Half", "BFloat16", "Float"],
-        help="Precision used to load the model.",
-    )
     parser.add_argument(
         "--restore-from", type=str, default=None, help="Path to the modelopt quantized checkpoint"
     )
@@ -170,28 +163,27 @@ def main():
 
     pipe = PipelineManager.create_pipeline_from(
         MODEL_ID[args.model],
-        dtype_map[args.model_dtype],
         override_model_path=args.override_model_path,
     )
 
     # Save the backbone of the pipeline and move it to the GPU
     add_embedding = None
     backbone = None
+    model_dtype = None
     if hasattr(pipe, "transformer"):
         backbone = pipe.transformer
model_dtype = "Bfloat16"
     elif hasattr(pipe, "unet"):
         backbone = pipe.unet
         add_embedding = backbone.add_embedding
+        model_dtype = "Half"
     else:
         raise ValueError("Pipeline does not have a transformer or unet backbone")
 
     if args.restore_from:
         mto.restore(backbone, args.restore_from)
 
     if args.torch_compile:
-        assert args.model_dtype in ["BFloat16", "Float", "Half"], (
-            "torch.compile() only supports BFloat16 and Float"
-        )
         print("Compiling backbone with torch.compile()...")
         backbone = torch.compile(backbone, mode="max-autotune")
 
@@ -203,7 +195,7 @@ def main():
         pipe.to("cuda")
 
     if args.benchmark:
-        benchmark_model(pipe, args.prompt, model_dtype=args.model_dtype)
+        benchmark_model(pipe, args.prompt, model_dtype=model_dtype)
 
     if not args.skip_image:
         generate_image(pipe, args.prompt, image_name)
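The net effect of the change is that precision is no longer a user-facing knob: pipelines with a transformer backbone (sd3-medium, flux-dev) are pinned to BFloat16, while UNet-based pipelines (sdxl-1.0, sdxl-turbo) use Half. A condensed sketch of the enforced rule follows; the `dtype_map` table is a plausible reconstruction of how the string resolves to a torch dtype downstream (the script previously indexed a `dtype_map` with these same keys), not the repository's exact code:

```python
import torch

# Plausible resolution table; benchmark_model() receives the string and is
# assumed to map it to a torch dtype along these lines.
dtype_map = {"Half": torch.float16, "BFloat16": torch.bfloat16, "Float": torch.float32}

def enforced_model_dtype(pipe) -> str:
    """Mirror the commit's rule: transformer backbones run in BFloat16,
    UNet backbones in Half; anything else is rejected."""
    if hasattr(pipe, "transformer"):
        return "BFloat16"
    if hasattr(pipe, "unet"):
        return "Half"
    raise ValueError("Pipeline does not have a transformer or unet backbone")
```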
