
Commit f329b19

ajrasane authored and kevalmorabia97 committed
[NVBUG: 5619158] Optimize memory usage for diffusion_trt.py (#547)
## What does this PR do?

**Type of change:** Minor code change

**Overview:**
- Delete backbone after Device Model creation
- Add assertion for torch compile
- Update dummy input generation function

## Testing

```
python diffusion_trt.py --model flux-dev --benchmark --skip-image
python diffusion_trt.py --model flux-dev --benchmark --skip-image --restore-from ./flux_dev_fp8_autodeploy_fake.pt
python diffusion_trt.py --model flux-dev --benchmark --skip-image --restore-from ./flux_dev_fp4_autodeploy_fake.pt
python diffusion_trt.py --model flux-dev --benchmark --skip-image --torch
python diffusion_trt.py --model flux-dev --benchmark --skip-image --restore-from ./flux_dev_fp8_autodeploy_fake.pt --torch
python diffusion_trt.py --model flux-dev --benchmark --skip-image --restore-from ./flux_dev_fp4_autodeploy_fake.pt --torch
python diffusion_trt.py --model flux-dev --benchmark --skip-image --torch --torch-compile
```

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: No
- **Did you add or update any necessary documentation?**: No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: No

---------

Signed-off-by: ajrasane <[email protected]>
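The new assertion encodes a flag dependency: `--torch-compile` only makes sense together with `--torch`, since the TensorRT path replaces the backbone with a compiled engine rather than a PyTorch module. A minimal sketch of that check, using a simplified stand-in for the real CLI parser in diffusion_trt.py:

```python
# Simplified stand-in for the diffusion_trt.py CLI: only the two flags
# involved in the new assertion are modeled here.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--torch", action="store_true")
parser.add_argument("--torch-compile", action="store_true")
args = parser.parse_args(["--torch", "--torch-compile"])

# Fail fast on an invalid flag combination before any model is loaded.
if args.torch_compile:
    assert args.torch, "Torch mode must be enabled when torch_compile is used"
```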

3 files changed: +62 additions, −10 deletions

examples/diffusers/quantization/diffusion_trt.py

12 additions & 6 deletions

```diff
@@ -172,6 +172,8 @@ def main():
         override_model_path=args.override_model_path,
     )
 
+    if args.torch_compile:
+        assert args.torch, "Torch mode must be enabled when torch_compile is used"
     # Save the backbone of the pipeline and move it to the GPU
     add_embedding = None
     backbone = None
@@ -186,11 +188,10 @@ def main():
     if args.restore_from:
         mto.restore(backbone, args.restore_from)
 
-    if args.torch_compile:
-        print("Compiling backbone with torch.compile()...")
-        backbone = torch.compile(backbone, mode="max-autotune")
-
     if args.torch:
+        if args.torch_compile:
+            print("Compiling backbone with torch.compile()...")
+            backbone = torch.compile(backbone, mode="max-autotune")
         if hasattr(pipe, "transformer"):
             pipe.transformer = backbone
         elif hasattr(pipe, "unet"):
@@ -250,9 +251,15 @@ def main():
         dq_only=args.dq_only,
     )
 
+    # Delete the original backbone and empty the cache
+    del backbone
+    torch.cuda.empty_cache()
+
     if not args.trt_engine_load_path:
         # Compile the TRT engine from the exported ONNX model
         compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
+        # Clear onnx_bytes to free memory
+        del onnx_bytes
         # Save TRT engine for future use
         with open(f"{args.model}.plan", "wb") as f:
             # Remove the SHA-256 hash from the compiled model, used to maintain state in the trt_client
@@ -276,8 +283,7 @@ def main():
     if hasattr(pipe, "unet") and add_embedding:
         setattr(device_model, "add_embedding", add_embedding)
 
-    # Move the backbone back to the CPU and set the backbone to the compiled device model
-    backbone.to("cpu")
+    # Set the backbone to the device model
     if hasattr(pipe, "unet"):
         pipe.unet = device_model
     elif hasattr(pipe, "transformer"):
```
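The memory saving in this file comes from dropping the last Python reference to the PyTorch backbone once the TensorRT device model owns inference, then asking the CUDA caching allocator to return its cached blocks to the driver. A minimal sketch of the pattern; `build_device_model` and `swap_backbone_for_device_model` are hypothetical names standing in for the ONNX-export and TRT-compile path above:

```python
import torch

def swap_backbone_for_device_model(pipe, backbone, build_device_model):
    # Hypothetical stand-in for the export + TRT-compile path.
    device_model = build_device_model(backbone)
    # Drop the local reference; the module's memory is only actually
    # freed once no other Python reference to it remains.
    del backbone
    if torch.cuda.is_available():
        # Release cached blocks back to the driver; without this the
        # caching allocator keeps the freed memory reserved for PyTorch.
        torch.cuda.empty_cache()
    # Point the pipeline at the compiled device model.
    if hasattr(pipe, "transformer"):
        pipe.transformer = device_model
    elif hasattr(pipe, "unet"):
        pipe.unet = device_model
    return pipe
```

This also explains the removal of `backbone.to("cpu")` in the last hunk: there is no PyTorch backbone left to move once it has been deleted.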

examples/diffusers/quantization/onnx_utils/export.py

12 additions & 4 deletions

```diff
@@ -128,7 +128,9 @@ def generate_fp8_scales(backbone):
 
 
 def _gen_dummy_inp_and_dyn_shapes_sdxl(backbone, min_bs=1, opt_bs=1):
-    assert isinstance(backbone, UNet2DConditionModel)
+    assert isinstance(backbone, UNet2DConditionModel) or isinstance(
+        backbone._orig_mod, UNet2DConditionModel
+    )
     cfg = backbone.config
     assert cfg.addition_embed_type == "text_time"
 
@@ -173,7 +175,9 @@ def _gen_dummy_inp_and_dyn_shapes_sdxl(backbone, min_bs=1, opt_bs=1):
 
 
 def _gen_dummy_inp_and_dyn_shapes_sd3(backbone, min_bs=1, opt_bs=1):
-    assert isinstance(backbone, SD3Transformer2DModel)
+    assert isinstance(backbone, SD3Transformer2DModel) or isinstance(
+        backbone._orig_mod, SD3Transformer2DModel
+    )
     cfg = backbone.config
 
     dynamic_shapes = {
@@ -205,7 +209,9 @@ def _gen_dummy_inp_and_dyn_shapes_sd3(backbone, min_bs=1, opt_bs=1):
 
 
 def _gen_dummy_inp_and_dyn_shapes_flux(backbone, min_bs=1, opt_bs=1):
-    assert isinstance(backbone, FluxTransformer2DModel)
+    assert isinstance(backbone, FluxTransformer2DModel) or isinstance(
+        backbone._orig_mod, FluxTransformer2DModel
+    )
     cfg = backbone.config
     text_maxlen = 512
     img_dim = 4096
@@ -251,7 +257,9 @@ def _gen_dummy_inp_and_dyn_shapes_flux(backbone, min_bs=1, opt_bs=1):
 
 
 def _gen_dummy_inp_and_dyn_shapes_ltx(backbone, min_bs=2, opt_bs=2):
-    assert isinstance(backbone, LTXVideoTransformer3DModel)
+    assert isinstance(backbone, LTXVideoTransformer3DModel) or isinstance(
+        backbone._orig_mod, LTXVideoTransformer3DModel
+    )
     cfg = backbone.config
     dtype = backbone.dtype
     video_dim = 2240
```
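The `._orig_mod` fallback in these assertions is needed because `torch.compile` does not return the module itself: it wraps it in a dynamo `OptimizedModule`, so a plain `isinstance` check against the original class fails. The wrapper keeps the wrapped module at `._orig_mod`. A minimal sketch demonstrating the behavior (assumes PyTorch 2.x):

```python
import torch
import torch.nn as nn

class TinyBackbone(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2

# torch.compile returns an OptimizedModule wrapper, not a TinyBackbone.
backbone = torch.compile(TinyBackbone())
print(isinstance(backbone, TinyBackbone))            # False: it's the wrapper
print(isinstance(backbone._orig_mod, TinyBackbone))  # True: the wrapped module
```

Note that `or` short-circuits, so `backbone._orig_mod` is only touched when the first check fails, i.e. when the backbone is expected to be a compiled wrapper.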

tests/examples/diffusers/test_diffusers.py

38 additions & 0 deletions

```diff
@@ -150,3 +150,41 @@ def test_diffusers_quantization(
     model.quantize(tmp_path)
     model.restore(tmp_path)
     model.inference(tmp_path)
+
+
+@pytest.mark.parametrize(
+    ("model_name", "model_path", "torch_compile"),
+    [
+        ("flux-schnell", FLUX_SCHNELL_PATH, False),
+        ("flux-schnell", FLUX_SCHNELL_PATH, True),
+        ("sd3-medium", SD3_PATH, False),
+        ("sd3-medium", SD3_PATH, True),
+        ("sdxl-1.0", SDXL_1_0_PATH, False),
+        ("sdxl-1.0", SDXL_1_0_PATH, True),
+    ],
+    ids=[
+        "flux_schnell_torch",
+        "flux_schnell_torch_compile",
+        "sd3_medium_torch",
+        "sd3_medium_torch_compile",
+        "sdxl_1.0_torch",
+        "sdxl_1.0_torch_compile",
+    ],
+)
+def test_diffusion_trt_torch(
+    model_name: str,
+    model_path: str,
+    torch_compile: bool,
+) -> None:
+    cmd_args = [
+        "python",
+        "diffusion_trt.py",
+        "--model",
+        model_name,
+        "--override-model-path",
+        model_path,
+        "--torch",
+    ]
+    if torch_compile:
+        cmd_args.append("--torch-compile")
+    run_example_command(cmd_args, "diffusers/quantization")
```
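`run_example_command` is a shared helper from the repo's test utilities; its implementation isn't shown in this diff. A plausible minimal version, with `EXAMPLES_ROOT` as an assumed path (the real helper may differ), would run the command from the example's directory and fail the test on a non-zero exit:

```python
import subprocess
from pathlib import Path

EXAMPLES_ROOT = Path("examples")  # assumed location of the example scripts

def run_example_command(cmd_args: list[str], example_dir: str) -> None:
    # Run the example from its own directory so relative paths resolve;
    # check=True raises CalledProcessError (failing the test) on non-zero exit.
    subprocess.run(cmd_args, cwd=EXAMPLES_ROOT / example_dir, check=True)
```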
