Commit 2290533

[NVBUG: 5619158] Enforce high precision model dtype for diffusion trt (#526)
## What does this PR do?

**Type of change:** Minor code change

**Overview:**

- Select the high-precision dtype directly based on model type: FP16 for Stable Diffusion models, BF16 for Flux.

## Testing

```bash
python diffusion_trt.py --model flux-dev --benchmark
```

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: No (there is no longer an option to specify the dtype when loading the pipeline)
- **Did you write any new necessary tests?**: No
- **Did you add or update any necessary documentation?**: Yes
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes

---

Signed-off-by: ajrasane <[email protected]>
1 parent: 5adb9ba · commit: 2290533

3 files changed: +20 −21 lines

CHANGELOG.rst (1 addition, 0 deletions)

```diff
@@ -30,6 +30,7 @@ Model Optimizer Changelog (Linux)
 - Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
 - Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
 - Add flags ``nodes_to_include`` and ``op_types_to_include`` in AutoCast to force-include nodes in low precision, even if they would otherwise be excluded by other rules.
+- Add support for ``torch.compile`` and benchmarking in ``examples/diffusers/quantization/diffusion_trt.py``.
 
 **Documentation**
```

examples/diffusers/README.md (7 additions, 2 deletions)

````diff
@@ -307,11 +307,16 @@ Generate images for the quantized checkpoint with the following [Script](./quant
 python diffusion_trt.py \
     --model {sdxl-1.0|sdxl-turbo|sd3-medium|flux-dev} \
     --prompt "A cat holding a sign that says hello world" \
+    [--override-model-path /path/to/model] \
     [--restore-from ./{MODEL}_fp8.pt] \
     [--onnx-load-path {ONNX_DIR}] \
     [--trt-engine-load-path {ENGINE_DIR}] \
-    [--dq_only] \
-    [--torch]
+    [--dq-only] \
+    [--torch] \
+    [--save-image-as /path/to/image] \
+    [--benchmark] \
+    [--torch-compile] \
+    [--skip-image]
 ```
 
 This script will save the output image as `./{MODEL}.png` and report the latency of the TensorRT backbone.
````

examples/diffusers/quantization/diffusion_trt.py (12 additions, 19 deletions)

```diff
@@ -40,10 +40,12 @@
     "flux-schnell": ModelType.FLUX_SCHNELL,
 }
 
-dtype_map = {
-    "Half": torch.float16,
-    "BFloat16": torch.bfloat16,
-    "Float": torch.float32,
+DTYPE_MAP = {
+    "sdxl-1.0": torch.float16,
+    "sdxl-turbo": torch.float16,
+    "sd3-medium": torch.float16,
+    "flux-dev": torch.bfloat16,
+    "flux-schnell": torch.bfloat16,
 }
 
 
```
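The `--model-dtype` flag is gone; precision is now keyed on the model name, so Stable Diffusion variants always load in FP16 and Flux variants in BF16. A minimal sketch of the new lookup (only the mapping itself comes from the diff; the surrounding demo lines are illustrative):

```python
import torch

# Mirrors the new DTYPE_MAP: the high-precision dtype follows the model
# family instead of a user-supplied flag.
DTYPE_MAP = {
    "sdxl-1.0": torch.float16,
    "sdxl-turbo": torch.float16,
    "sd3-medium": torch.float16,
    "flux-dev": torch.bfloat16,
    "flux-schnell": torch.bfloat16,
}

print(DTYPE_MAP["flux-dev"])   # torch.bfloat16
print(DTYPE_MAP["sdxl-1.0"])   # torch.float16
```

An unknown key would raise `KeyError`, but `--model` is presumably restricted by argparse `choices` to exactly these names, so the lookup cannot miss.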

```diff
@@ -60,7 +62,7 @@ def generate_image(pipe, prompt, image_name):
 
 
 def benchmark_model(
-    pipe, prompt, num_warmup=10, num_runs=50, num_inference_steps=20, model_dtype="Half"
+    pipe, prompt, num_warmup=10, num_runs=50, num_inference_steps=20, model_dtype=torch.float16
 ):
     """Benchmark the backbone model inference time."""
     backbone = pipe.transformer if hasattr(pipe, "transformer") else pipe.unet
```
```diff
@@ -83,7 +85,7 @@ def forward_hook(_module, _input, _output):
     try:
         print(f"Starting warmup: {num_warmup} runs")
         for _ in tqdm(range(num_warmup), desc="Warmup"):
-            with torch.amp.autocast("cuda", dtype=dtype_map[model_dtype]):
+            with torch.amp.autocast("cuda", dtype=model_dtype):
                 _ = pipe(
                     prompt,
                     output_type="pil",
```
```diff
@@ -95,7 +97,7 @@ def forward_hook(_module, _input, _output):
 
         print(f"Starting benchmark: {num_runs} runs")
         for _ in tqdm(range(num_runs), desc="Benchmark"):
-            with torch.amp.autocast("cuda", dtype=dtype_map[model_dtype]):
+            with torch.amp.autocast("cuda", dtype=model_dtype):
                 _ = pipe(
                     prompt,
                     output_type="pil",
```
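`benchmark_model` now receives a `torch.dtype` directly instead of a string key into the old `dtype_map`. For orientation, here is a self-contained sketch of the kind of autocast-wrapped timing loop the warmup and benchmark phases perform; the `time_pipeline` helper and its defaults are illustrative, since the actual script measures only the backbone, apparently via a forward hook:

```python
import time
import torch

def time_pipeline(pipe, prompt, model_dtype=torch.bfloat16, num_runs=5):
    """Illustrative: time full pipeline calls under autocast. The real
    script hooks pipe.transformer / pipe.unet and times just the backbone."""
    latencies = []
    for _ in range(num_runs):
        torch.cuda.synchronize()
        start = time.perf_counter()
        # Autocast at the model's high-precision dtype, as in the diff.
        with torch.amp.autocast("cuda", dtype=model_dtype):
            _ = pipe(prompt, output_type="pil", num_inference_steps=20)
        torch.cuda.synchronize()
        latencies.append(time.perf_counter() - start)
    return sum(latencies) / len(latencies)
```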
```diff
@@ -126,13 +128,6 @@ def main():
         default=None,
         help="Path to the model if not using default paths in MODEL_ID mapping.",
     )
-    parser.add_argument(
-        "--model-dtype",
-        type=str,
-        default="Half",
-        choices=["Half", "BFloat16", "Float"],
-        help="Precision used to load the model.",
-    )
     parser.add_argument(
         "--restore-from", type=str, default=None, help="Path to the modelopt quantized checkpoint"
     )
```
```diff
@@ -167,10 +162,11 @@
     args = parser.parse_args()
 
     image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"
+    model_dtype = DTYPE_MAP[args.model]
 
     pipe = PipelineManager.create_pipeline_from(
         MODEL_ID[args.model],
-        dtype_map[args.model_dtype],
+        torch_dtype=model_dtype,
         override_model_path=args.override_model_path,
     )
 
```
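`PipelineManager` is internal to this repository. The same pattern in plain `diffusers` looks like the sketch below; the Hugging Face repo names are assumptions for illustration, not the script's actual `MODEL_ID` table:

```python
import torch
from diffusers import DiffusionPipeline

DTYPE_MAP = {"sdxl-1.0": torch.float16, "flux-dev": torch.bfloat16}
# Assumed Hugging Face model IDs, for illustration only.
MODEL_ID = {
    "sdxl-1.0": "stabilityai/stable-diffusion-xl-base-1.0",
    "flux-dev": "black-forest-labs/FLUX.1-dev",
}

model = "flux-dev"
# The load dtype now follows the model name; no user-facing dtype flag.
pipe = DiffusionPipeline.from_pretrained(MODEL_ID[model], torch_dtype=DTYPE_MAP[model])
```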

```diff
@@ -189,9 +185,6 @@
         mto.restore(backbone, args.restore_from)
 
     if args.torch_compile:
-        assert args.model_dtype in ["BFloat16", "Float", "Half"], (
-            "torch.compile() only supports BFloat16 and Float"
-        )
         print("Compiling backbone with torch.compile()...")
         backbone = torch.compile(backbone, mode="max-autotune")
 
```
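The removed assertion was in any case vacuous (it accepted every value `--model-dtype` allowed), and both dtypes in `DTYPE_MAP` are ones `torch.compile` handles, so there is no user input left to validate. Note that a compiled backbone pays its compilation and autotuning cost on the first calls, which is what the warmup phase in `benchmark_model` absorbs before timing begins; a tiny self-contained illustration (the `Linear` stand-in is not the script's backbone):

```python
import torch

# Stand-in module; the script compiles the pipeline's transformer/unet.
net = torch.nn.Linear(8, 8).cuda()
net = torch.compile(net, mode="max-autotune")  # same mode as the diff

x = torch.randn(4, 8, device="cuda")
for _ in range(3):
    _ = net(x)  # first calls trigger compilation and run slowly
```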

```diff
@@ -203,7 +196,7 @@
     pipe.to("cuda")
 
     if args.benchmark:
-        benchmark_model(pipe, args.prompt, model_dtype=args.model_dtype)
+        benchmark_model(pipe, args.prompt, model_dtype=model_dtype)
 
     if not args.skip_image:
         generate_image(pipe, args.prompt, image_name)
```
