     update_dynamic_axes,
 )
 from quantize import ModelType, PipelineManager
-from tqdm import tqdm

 import modelopt.torch.opt as mto
 from modelopt.torch._deploy._runtime import RuntimeRegistry
@@ -59,59 +58,6 @@ def generate_image(pipe, prompt, image_name):
     print(f"Image generated saved as {image_name}")


-def benchmark_model(
-    pipe, prompt, num_warmup=10, num_runs=50, num_inference_steps=20, model_dtype="Half"
-):
-    """Benchmark the backbone model inference time."""
-    backbone = pipe.transformer if hasattr(pipe, "transformer") else pipe.unet
-
-    backbone_times = []
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    def forward_pre_hook(_module, _input):
-        start_event.record()
-
-    def forward_hook(_module, _input, _output):
-        end_event.record()
-        torch.cuda.synchronize()
-        backbone_times.append(start_event.elapsed_time(end_event))
-
-    pre_handle = backbone.register_forward_pre_hook(forward_pre_hook)
-    post_handle = backbone.register_forward_hook(forward_hook)
-
-    try:
-        print(f"Starting warmup: {num_warmup} runs")
-        for _ in tqdm(range(num_warmup), desc="Warmup"):
-            with torch.amp.autocast("cuda", dtype=dtype_map[model_dtype]):
-                _ = pipe(
-                    prompt,
-                    output_type="pil",
-                    num_inference_steps=num_inference_steps,
-                    generator=torch.Generator("cuda").manual_seed(42),
-                )
-
-        backbone_times.clear()
-
-        print(f"Starting benchmark: {num_runs} runs")
-        for _ in tqdm(range(num_runs), desc="Benchmark"):
-            with torch.amp.autocast("cuda", dtype=dtype_map[model_dtype]):
-                _ = pipe(
-                    prompt,
-                    output_type="pil",
-                    num_inference_steps=num_inference_steps,
-                    generator=torch.Generator("cuda").manual_seed(42),
-                )
-    finally:
-        pre_handle.remove()
-        post_handle.remove()
-
-    total_backbone_time = sum(backbone_times)
-    avg_latency = total_backbone_time / (num_runs * num_inference_steps)
-    print(f"Inference latency of the torch backbone: {avg_latency:.2f} ms")
-    return avg_latency
-
-
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -146,24 +92,15 @@ def main():
14692 "--onnx-load-path" , type = str , default = "" , help = "Path to load the ONNX model"
14793 )
14894 parser .add_argument (
149- "--trt-engine-load-path" , type = str , default = None , help = "Path to load the TensorRT engine"
95+ "--trt-engine-load-path" , type = str , default = None , help = "Path to load the TRT engine"
15096 )
15197 parser .add_argument (
15298 "--dq-only" , action = "store_true" , help = "Converts the ONNX model to a dq_only model"
15399 )
154100 parser .add_argument (
155- "--torch" ,
156- action = "store_true" ,
157- help = "Use the torch pipeline for image generation or benchmarking" ,
101+ "--torch" , action = "store_true" , help = "Generate an image using the torch pipeline"
158102 )
159103 parser .add_argument ("--save-image-as" , type = str , default = None , help = "Name of the image to save" )
160- parser .add_argument (
161- "--benchmark" , action = "store_true" , help = "Benchmark the model backbone inference time"
162- )
163- parser .add_argument (
164- "--torch-compile" , action = "store_true" , help = "Use torch.compile() on the backbone model"
165- )
166- parser .add_argument ("--skip-image" , action = "store_true" , help = "Skip image generation" )
167104 args = parser .parse_args ()
168105
169106 image_name = args .save_image_as if args .save_image_as else f"{ args .model } .png"
@@ -188,25 +125,13 @@ def main():
     if args.restore_from:
         mto.restore(backbone, args.restore_from)

-    if args.torch_compile:
-        assert args.model_dtype in ["BFloat16", "Float", "Half"], (
-            "torch.compile() only supports BFloat16 and Float"
-        )
-        print("Compiling backbone with torch.compile()...")
-        backbone = torch.compile(backbone, mode="max-autotune")
-
     if args.torch:
         if hasattr(pipe, "transformer"):
             pipe.transformer = backbone
         elif hasattr(pipe, "unet"):
             pipe.unet = backbone
         pipe.to("cuda")
-
-        if args.benchmark:
-            benchmark_model(pipe, args.prompt, model_dtype=args.model_dtype)
-
-        if not args.skip_image:
-            generate_image(pipe, args.prompt, image_name)
+        generate_image(pipe, args.prompt, image_name)
         return

     backbone.to("cuda")
@@ -286,14 +211,10 @@ def main():
         raise ValueError("Pipeline does not have a transformer or unet backbone")
     pipe.to("cuda")

-    if not args.skip_image:
-        generate_image(pipe, args.prompt, image_name)
-        print(f"Image generated using {args.model} model saved as {image_name}")
+    generate_image(pipe, args.prompt, image_name)
+    print(f"Image generated using {args.model} model saved as {image_name}")

-    if args.benchmark:
-        print(
-            f"Inference latency of the TensorRT optimized backbone: {device_model.get_latency()} ms"
-        )
+    print(f"Inference latency of the backbone of the pipeline is {device_model.get_latency()} ms")


 if __name__ == "__main__":
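
Note (not part of the diff): with benchmark_model and the --benchmark/--torch-compile/--skip-image flags removed, the script only reports latency through device_model.get_latency() on the TensorRT path. If a torch-side number is still needed, the CUDA-event/forward-hook pattern the deleted function used can be run standalone. The sketch below is illustrative only; the model id, prompt, and run/step counts are assumptions, not values from the script.

# Standalone sketch of the removed hook-based backbone timing (assumed model id and prompt).
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
backbone = pipe.transformer if hasattr(pipe, "transformer") else pipe.unet

times = []
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

def _pre(_module, _input):
    start.record()  # mark the start of each backbone forward pass

def _post(_module, _input, _output):
    end.record()
    torch.cuda.synchronize()
    times.append(start.elapsed_time(end))  # elapsed time in milliseconds

pre_handle = backbone.register_forward_pre_hook(_pre)
post_handle = backbone.register_forward_hook(_post)
try:
    num_runs, steps = 10, 20  # no warmup pass here, unlike the removed benchmark_model
    for _ in range(num_runs):
        pipe("a photo of an astronaut riding a horse", num_inference_steps=steps)
finally:
    pre_handle.remove()
    post_handle.remove()

print(f"Average backbone latency: {sum(times) / (num_runs * steps):.2f} ms")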