Skip to content

Commit e0a6efb

Browse files
fix trt engine building of the diffusers pipelines (NVIDIA#637)
## What does this PR do? **Type of change:** Bug fix **Overview:** 1. diffusion_trt.py needs the dynamic_shapes when running trtexec for engine building. A previous change altered the format of dynamic_shapes; this PR fixes it. 2. The dynamic_shapes logic is cleaned up, since the existing logic was very confusing. 3. Restore the min batch_size config for some pipelines. Previously some pipelines set the min batch_size to be > 1, which seemed odd, so an earlier change set them to 1 — but it turns out the oddity had a reason: TRT engine building fails with the altered batch_size min/opt, so the original values are restored. ## Testing pytest tests/examples/diffusers --------- Signed-off-by: Shengliang Xu <[email protected]> Signed-off-by: Keval Morabia <[email protected]> Co-authored-by: Keval Morabia <[email protected]>
1 parent 422c58b commit e0a6efb

File tree

5 files changed

+43
-47
lines changed

5 files changed

+43
-47
lines changed

examples/diffusers/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,10 @@ Install Model Optimizer with `onnx` and `hf` dependencies using `pip` from [PyPI
3939

4040
```bash
4141
pip install nvidia-modelopt[onnx,hf]
42+
pip install -r requirements.txt
4243
```
4344

44-
Each subsection (cache_diffusion, quantization, etc.) have their own `requirements.txt` file that needs to be installed separately.
45+
Each subsection (eval, etc.) may have their own `requirements.txt` file that needs to be installed separately.
4546

4647
You can find the latest TensorRT [here](https://developer.nvidia.com/tensorrt/download).
4748

examples/diffusers/cache_diffusion/requirements.txt

Lines changed: 0 additions & 6 deletions
This file was deleted.

examples/diffusers/quantization/diffusion_trt.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import numpy as np
2020
import torch
2121
from onnx_utils.export import (
22+
_create_trt_dynamic_shapes,
2223
generate_dummy_inputs_and_dynamic_axes_and_shapes,
2324
get_io_shapes,
2425
remove_nesting,
@@ -186,11 +187,13 @@ def main():
186187

187188
if args.torch_compile:
188189
assert args.torch, "Torch mode must be enabled when torch_compile is used"
189-
# Save the backbone of the pipeline and move it to the GPU
190+
# Save the backbone (and other attributes) of the pipeline and move it to the GPU
190191
add_embedding = None
191-
backbone = None
192+
cache_context = None
192193
if hasattr(pipe, "transformer"):
193194
backbone = pipe.transformer
195+
if hasattr(backbone, "cache_context"):
196+
cache_context = backbone.cache_context
194197
elif hasattr(pipe, "unet"):
195198
backbone = pipe.unet
196199
add_embedding = backbone.add_embedding
@@ -234,13 +237,13 @@ def main():
234237
if args.onnx_load_path == "":
235238
update_dynamic_axes(args.model, dynamic_axes)
236239

237-
compilation_args = dynamic_shapes
240+
trt_dynamic_shapes = _create_trt_dynamic_shapes(dynamic_shapes)
238241

239242
# We only need to remove the nesting for SDXL models as they contain the nested input added_cond_kwargs
240243
# which are renamed by the DeviceModel
241244
ignore_nesting = False
242245
if args.onnx_load_path != "" and args.model in ["sdxl-1.0", "sdxl-turbo"]:
243-
remove_nesting(compilation_args)
246+
remove_nesting(trt_dynamic_shapes)
244247
ignore_nesting = True
245248

246249
# Define deployment configuration
@@ -268,6 +271,7 @@ def main():
268271
del backbone
269272
torch.cuda.empty_cache()
270273

274+
compilation_args = {"dynamic_shapes": trt_dynamic_shapes}
271275
if not args.trt_engine_load_path:
272276
# Compile the TRT engine from the exported ONNX model
273277
compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
@@ -289,18 +293,18 @@ def main():
289293
compiled_model,
290294
metadata,
291295
compilation_args,
292-
get_io_shapes(args.model, args.onnx_load_path, dynamic_shapes),
296+
get_io_shapes(args.model, args.onnx_load_path, trt_dynamic_shapes),
293297
ignore_nesting,
294298
)
295299

296-
if hasattr(pipe, "unet") and add_embedding:
297-
setattr(device_model, "add_embedding", add_embedding)
298-
299-
# Set the backbone to the device model
300-
if hasattr(pipe, "unet"):
301-
pipe.unet = device_model
302-
elif hasattr(pipe, "transformer"):
300+
# Set the backbone and other attributes to the device model
301+
if hasattr(pipe, "transformer"):
303302
pipe.transformer = device_model
303+
if cache_context:
304+
device_model.cache_context = cache_context
305+
elif hasattr(pipe, "unet"):
306+
pipe.unet = device_model
307+
device_model.add_embedding = add_embedding
304308
else:
305309
raise ValueError("Pipeline does not have a transformer or unet backbone")
306310
pipe.to("cuda")

examples/diffusers/quantization/onnx_utils/export.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -368,38 +368,36 @@ def update_dynamic_axes(model_id, dynamic_axes):
368368
dynamic_axes["out.0"] = dynamic_axes.pop("out_hidden_states")
369369

370370

371-
def _create_dynamic_shapes(dynamic_shapes):
371+
def _create_trt_dynamic_shapes(dynamic_shapes):
372372
min_shapes = {}
373373
opt_shapes = {}
374374
for key, value in dynamic_shapes.items():
375375
min_shapes[key] = value["min"]
376376
opt_shapes[key] = value["opt"]
377377
return {
378-
"dynamic_shapes": {
379-
"minShapes": min_shapes,
380-
"optShapes": opt_shapes,
381-
"maxShapes": opt_shapes,
382-
}
378+
"minShapes": min_shapes,
379+
"optShapes": opt_shapes,
380+
"maxShapes": opt_shapes,
383381
}
384382

385383

386384
def generate_dummy_inputs_and_dynamic_axes_and_shapes(model_id, backbone):
387385
"""Generate dummy inputs, dynamic axes, and dynamic shapes for the given model."""
388386
if model_id in ["sdxl-1.0", "sdxl-turbo"]:
389387
dummy_kwargs, dynamic_shapes = _gen_dummy_inp_and_dyn_shapes_sdxl(
390-
backbone, min_bs=1, opt_bs=16
388+
backbone, min_bs=2, opt_bs=16
391389
)
392390
elif model_id in ["sd3-medium", "sd3.5-medium"]:
393391
dummy_kwargs, dynamic_shapes = _gen_dummy_inp_and_dyn_shapes_sd3(
394-
backbone, min_bs=1, opt_bs=16
392+
backbone, min_bs=2, opt_bs=16
395393
)
396394
elif model_id in ["flux-dev", "flux-schnell"]:
397395
dummy_kwargs, dynamic_shapes = _gen_dummy_inp_and_dyn_shapes_flux(
398-
backbone, min_bs=1, opt_bs=2
396+
backbone, min_bs=1, opt_bs=1
399397
)
400398
elif model_id == "ltx-video-dev":
401399
dummy_kwargs, dynamic_shapes = _gen_dummy_inp_and_dyn_shapes_ltx(
402-
backbone, min_bs=1, opt_bs=2
400+
backbone, min_bs=2, opt_bs=2
403401
)
404402
elif model_id == "wan2.2-t2v-14b":
405403
dummy_kwargs, dynamic_shapes = _gen_dummy_inp_and_dyn_shapes_wan(
@@ -414,7 +412,7 @@ def generate_dummy_inputs_and_dynamic_axes_and_shapes(model_id, backbone):
414412
return dummy_kwargs, dynamic_axes, dynamic_shapes
415413

416414

417-
def get_io_shapes(model_id, onnx_load_path, dynamic_shapes):
415+
def get_io_shapes(model_id, onnx_load_path, trt_dynamic_shapes):
418416
output_name = "out.0"
419417
if onnx_load_path != "":
420418
if model_id in ["sdxl-1.0", "sdxl-turbo"]:
@@ -429,28 +427,28 @@ def get_io_shapes(model_id, onnx_load_path, dynamic_shapes):
429427
raise NotImplementedError(f"Unsupported model_id: {model_id}")
430428

431429
if model_id in ["sdxl-1.0", "sdxl-turbo"]:
432-
io_shapes = {output_name: dynamic_shapes["dynamic_shapes"]["minShapes"]["sample"]}
430+
io_shapes = {output_name: trt_dynamic_shapes["minShapes"]["sample"]}
433431
elif model_id in ["sd3-medium", "sd3.5-medium"]:
434-
io_shapes = {output_name: dynamic_shapes["dynamic_shapes"]["minShapes"]["hidden_states"]}
432+
io_shapes = {output_name: trt_dynamic_shapes["minShapes"]["hidden_states"]}
435433
elif model_id in ["flux-dev", "flux-schnell"]:
436434
io_shapes = {}
437435

438436
return io_shapes
439437

440438

441-
def remove_nesting(dynamic_shapes):
442-
dynamic_shapes["dynamic_shapes"]["minShapes"]["text_embeds"] = dynamic_shapes["dynamic_shapes"][
443-
"minShapes"
444-
].pop("added_cond_kwargs.text_embeds")
445-
dynamic_shapes["dynamic_shapes"]["minShapes"]["time_ids"] = dynamic_shapes["dynamic_shapes"][
446-
"minShapes"
447-
].pop("added_cond_kwargs.time_ids")
448-
dynamic_shapes["dynamic_shapes"]["optShapes"]["text_embeds"] = dynamic_shapes["dynamic_shapes"][
449-
"optShapes"
450-
].pop("added_cond_kwargs.text_embeds")
451-
dynamic_shapes["dynamic_shapes"]["optShapes"]["time_ids"] = dynamic_shapes["dynamic_shapes"][
452-
"optShapes"
453-
].pop("added_cond_kwargs.time_ids")
439+
def remove_nesting(trt_dynamic_shapes):
440+
trt_dynamic_shapes["minShapes"]["text_embeds"] = trt_dynamic_shapes["minShapes"].pop(
441+
"added_cond_kwargs.text_embeds"
442+
)
443+
trt_dynamic_shapes["minShapes"]["time_ids"] = trt_dynamic_shapes["minShapes"].pop(
444+
"added_cond_kwargs.time_ids"
445+
)
446+
trt_dynamic_shapes["optShapes"]["text_embeds"] = trt_dynamic_shapes["optShapes"].pop(
447+
"added_cond_kwargs.text_embeds"
448+
)
449+
trt_dynamic_shapes["optShapes"]["time_ids"] = trt_dynamic_shapes["optShapes"].pop(
450+
"added_cond_kwargs.time_ids"
451+
)
454452

455453

456454
def save_onnx(onnx_model, output):
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
cuda-python
2-
diffusers
1+
cuda-python<13
32
nvtx
43
opencv-python>=4.8.1.78,<4.12.0.88
54
sentencepiece

0 commit comments

Comments (0)