
Commit ac0beba: [update] build TRT code
1 parent 4ff5ec2

File tree: 9 files changed (+339, -50 lines)

examples/flux-tensorrt/README.md
Lines changed: 35 additions & 34 deletions

````diff
@@ -12,48 +12,49 @@ This project provides **TensorRT-accelerated pipelines** for Flux models, enabli
 
 ---
 
-## ⚙️ Building Flux with TensorRT
 
-We follow the official [NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT) repository to build TensorRT.
-
-> **Note:**
-> TensorRT was originally built with `diffusers==0.31.1`.
-> Currently, we recommend using:
-> - one **venv** for building, and
-> - another **venv** for inference.
-
-(🔜 TODO: Build scripts for the latest `diffusers` will be added later.)
-
-### Installation
+## Installation
 ```bash
-git clone https://github.com/NVIDIA/TensorRT
-cd TensorRT/demo/Diffusion
-
-pip install tensorrt-cu12==10.13.2.6
+cd diffusers/examples/flux-tensorrt
 pip install -r requirements.txt
 ```
 
-### ⚡ Fast Building with Static Shapes
-```bash
-# BF16
-python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --bf16 --download-onnx-models
+## ⚙️ Build Flux with TensorRT
 
-# FP8
-python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --quantization-level 4 --fp8 --download-onnx-models
+Before building, make sure you have the ONNX checkpoints ready.
+You can either download the official [Flux ONNX](https://huggingface.co/black-forest-labs/FLUX.1-dev-onnx) checkpoints from Hugging Face, or export your own.
 
-# FP4
-python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --fp4 --download-onnx-models
+```bash
+huggingface-cli download black-forest-labs/FLUX.1-dev-onnx --local-dir onnx
 ```
 
-- To build with dynamic shape, add: `--build-dynamic-shape`.
-- To build with static batch, add `--build-static-batch`.
+Build each component individually. For example, to build the **Transformer engine**:
+```python
+from module.transformers import FluxTransformerModel
+
+engine_path = "checkpoints_trt/transformer/engine.plan"
+engine_transformer = FluxTransformerModel(engine_path=engine_path, build=True)
+
+# Build the transformer engine
+transformer_input_profile = engine_transformer.get_input_profile(
+    opt_batch_size=1,
+    opt_image_height=1024,
+    opt_image_width=1024,
+    static_batch=True,
+    dynamic_shape=True,
+)
+engine_transformer.build(
+    onnx_path="onnx/transformer.opt/bf16/model.onnx",  # Replace with your ONNX path
+    input_profile=transformer_input_profile,
+)
+```
 
-ℹ️ For more details, run:
-`python demo_txt2img_flux.py --help`
+You can convert all ONNX checkpoints to TensorRT engines with a single command:
+```bash
+python convert_trt.py
+```
 
 ## 🖼️ Inference with Flux TensorRT
-Create a new venv (or update diffusers, peft in your existing one), then run fast inference using TensorRT engines.
-
 Example: Full Pipeline with All Engines
 
 ```python
@@ -68,10 +69,10 @@ from module.clip import CLIPModel
 import time
 
 # Local path for each engine
-engine_transformer_path = "path/to/transformer/engine_trt10.13.2.6.plan"
-engine_vae_path = "path/to/vae/engine_trt10.13.2.6.plan"
-engine_t5xxl_path = "path/to/t5/engine_trt10.13.2.6.plan"
-engine_clip_path = "path/to/clip/engine_trt10.13.2.6.plan"
+engine_transformer_path = "checkpoints_trt/transformer/engine.plan"
+engine_vae_path = "checkpoints_trt/vae/engine.plan"
+engine_t5xxl_path = "checkpoints_trt/t5/engine.plan"
+engine_clip_path = "checkpoints_trt/clip/engine.plan"
 
 # Create stream for each engine
 stream = cudart.cudaStreamCreate()[1]
````
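As a quick sanity check after building (not part of this commit), you can deserialize a plan and list its I/O tensors with the TensorRT 10 Python API. A minimal sketch, assuming the `checkpoints_trt` layout from the README:

```python
import tensorrt as trt

# Deserialize the freshly built plan and print each I/O tensor's
# name, mode (input/output), and shape (-1 marks a dynamic dimension).
logger = trt.Logger(trt.Logger.ERROR)
with open("checkpoints_trt/transformer/engine.plan", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    mode = engine.get_tensor_mode(name)  # trt.TensorIOMode.INPUT / .OUTPUT
    print(f"{name}: {mode}, shape={engine.get_tensor_shape(name)}")
```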
Lines changed: 28 additions & 0 deletions

````diff
@@ -0,0 +1,28 @@
+from module.transformers import FluxTransformerModel
+from module.clip import CLIPModel
+from module.t5xxl import T5XXLModel
+from module.vae import VAEModel
+
+models_config = {
+    "transformer": (FluxTransformerModel, "onnx/transformer.opt/bf16/model.onnx"),
+    "clip": (CLIPModel, "onnx/clip.opt/model.onnx"),
+    "t5": (T5XXLModel, "onnx/t5.opt/model.onnx"),
+    "vae": (VAEModel, "onnx/vae.opt/model.onnx"),
+}
+
+engines = {}
+
+for name, (ModelClass, onnx_path) in models_config.items():
+    engine_path = f"checkpoints_trt/{name}/engine.plan"
+    engine = ModelClass(engine_path=engine_path, build=True)
+
+    input_profile = engine.get_input_profile(
+        opt_batch_size=1,
+        opt_image_height=1024,
+        opt_image_width=1024,
+        static_batch=True,
+        dynamic_shape=True,
+    )
+
+    engine.build(onnx_path=onnx_path, input_profile=input_profile)
+    engines[name] = engine
````
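A small follow-up check, not part of the commit: after the loop finishes, each component should have a serialized plan where the inference example expects it. A sketch, with paths taken from the script above:

```python
from pathlib import Path

# Verify that every engine plan was written to checkpoints_trt/<name>/engine.plan.
for name in ("transformer", "clip", "t5", "vae"):
    plan = Path("checkpoints_trt") / name / "engine.plan"
    status = f"{plan.stat().st_size / 1e6:.1f} MB" if plan.exists() else "MISSING"
    print(f"{plan}: {status}")
```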
Lines changed: 43 additions & 0 deletions

````diff
@@ -0,0 +1,43 @@
+from pipeline_flux_trt import FluxPipelineTRT
+from cuda import cudart
+import torch
+
+from module.transformers import FluxTransformerModel
+from module.vae import VAEModel
+from module.t5xxl import T5XXLModel
+from module.clip import CLIPModel
+import time
+
+# Local path for each engine
+engine_transformer_path = "checkpoints_trt/transformer/engine.plan"
+engine_vae_path = "checkpoints_trt/vae/engine.plan"
+engine_t5xxl_path = "checkpoints_trt/t5/engine.plan"
+engine_clip_path = "checkpoints_trt/clip/engine.plan"
+
+# Create stream for each engine
+stream = cudart.cudaStreamCreate()[1]
+
+# Create engine for each model
+engine_transformer = FluxTransformerModel(engine_transformer_path, stream)
+engine_vae = VAEModel(engine_vae_path, stream)
+engine_t5xxl = T5XXLModel(engine_t5xxl_path, stream)
+engine_clip = CLIPModel(engine_clip_path, stream)
+
+# Create pipeline
+pipeline = FluxPipelineTRT.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.bfloat16,
+    engine_transformer=engine_transformer,
+    engine_vae=engine_vae,
+    engine_text_encoder=engine_clip,
+    engine_text_encoder_2=engine_t5xxl,
+)
+pipeline.to("cuda")
+
+prompt = "A cat holding a sign that says hello world"
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipeline(prompt, num_inference_steps=28, guidance_scale=3.0, generator=generator).images[0]
+
+image.save("test_pipeline.png")
````
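The script imports `time` but never uses it. If you want the speedup numbers this example is meant to demonstrate, here is a hedged sketch of a timing helper (the `pipeline` argument is the object built above; everything else is illustrative):

```python
import time

import torch


def timed_generate(pipeline, prompt, **kwargs):
    # CUDA launches are asynchronous, so synchronize before and after
    # to make the wall-clock measurement cover the full denoising loop.
    torch.cuda.synchronize()
    start = time.perf_counter()
    image = pipeline(prompt, **kwargs).images[0]
    torch.cuda.synchronize()
    print(f"pipeline call took {time.perf_counter() - start:.2f} s")
    return image

# The first call includes warm-up cost; time a second call for steady state:
# image = timed_generate(pipeline, prompt, num_inference_steps=28,
#                        guidance_scale=3.0, generator=generator)
```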
Lines changed: 15 additions & 4 deletions

````diff
@@ -1,14 +1,15 @@
 from .engine import Engine
 
 class CLIPModel(Engine):
-    def __init__(self, engine_path: str, stream = None):
+    def __init__(self, engine_path: str, stream = None, build = False):
         super().__init__(engine_path, stream)
         self.text_maxlen = 77
         self.embedding_dim = 768
         self.keep_pooled_output = True
 
-        # Load engine before
-        self.load_engine()
+        if not build:
+            # Load a prebuilt engine from disk
+            self.load_engine()
 
     def get_shape_dict(self, batch_size, image_height, image_width):
         self.check_dims(batch_size, image_height, image_width)
@@ -18,4 +19,14 @@ def get_shape_dict(self, batch_size, image_height, image_width):
         }
         if self.keep_pooled_output:
             output["pooled_embeddings"] = (batch_size, self.embedding_dim)
-        return output
+        return output
+
+    def get_input_profile(self, opt_batch_size=1, opt_image_height=1024, opt_image_width=1024, min_batch=1, max_batch=8, min_height=512, max_height=1280, min_width=512, max_width=1280, static_batch=True, dynamic_shape=True):
+        min_batch = opt_batch_size if static_batch else min_batch
+        max_batch = opt_batch_size if static_batch else max_batch
+
+        return {
+            "input_ids": [(min_batch, self.text_maxlen), (opt_batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+        }
````
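For reference (not part of the diff): each profile entry maps an input name to a `[min, opt, max]` shape list, which `Engine.build()` forwards to polygraphy. With the defaults above, `static_batch=True` pins the batch dimension, so the values below follow directly from the code:

```python
# CLIPModel.get_input_profile() with defaults (static_batch=True, text_maxlen=77):
# min == opt == max, i.e. the engine is built for exactly batch size 1.
static_profile = {"input_ids": [(1, 77), (1, 77), (1, 77)]}

# With static_batch=False, the batch dimension spans min_batch..max_batch:
dynamic_profile = {"input_ids": [(1, 77), (1, 77), (8, 77)]}
```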

examples/flux-tensorrt/module/engine.py
Lines changed: 120 additions & 1 deletion

````diff
@@ -2,7 +2,8 @@
 from cuda import cudart
 import torch
 import tensorrt as trt
-
+import subprocess
+from collections import defaultdict
 from collections import OrderedDict
 from polygraphy.backend.common import bytes_from_path
 from polygraphy.backend.trt import engine_from_bytes
@@ -78,6 +79,124 @@ def check_dims(self, batch_size, image_height, image_width, compression_factor =
         assert latent_width >= min_latent_shape and latent_width <= max_latent_shape
         return (latent_height, latent_width)
 
+    def build(
+        self,
+        onnx_path,
+        strongly_typed=False,
+        fp16=False,
+        bf16=True,
+        tf32=True,
+        int8=False,
+        fp8=False,
+        input_profile=None,
+        enable_refit=False,
+        enable_all_tactics=False,
+        timing_cache=None,
+        update_output_names=None,
+        native_instancenorm=True,
+        verbose=False,
+        weight_streaming=False,
+        builder_optimization_level=3,
+        precision_constraints='none',
+    ):
+        print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+
+        # Handle weight streaming case: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#streaming-weights.
+        if weight_streaming:
+            strongly_typed, fp16, bf16, int8, fp8 = True, False, False, False, False
+
+        # Base command
+        build_command = [f"polygraphy convert {onnx_path} --convert-to trt --output {self.engine_path}"]
+
+        # Precision flags
+        build_args = [
+            "--fp16" if fp16 else "",
+            "--bf16" if bf16 else "",
+            "--tf32" if tf32 else "",
+            "--fp8" if fp8 else "",
+            "--int8" if int8 else "",
+            "--strongly-typed" if strongly_typed else "",
+        ]
+
+        # Additional arguments
+        build_args.extend([
+            "--weight-streaming" if weight_streaming else "",
+            "--refittable" if enable_refit else "",
+            "--tactic-sources" if not enable_all_tactics else "",
+            "--onnx-flags native_instancenorm" if native_instancenorm else "",
+            f"--builder-optimization-level {builder_optimization_level}",
+            f"--precision-constraints {precision_constraints}",
+        ])
+
+        # Timing cache
+        if timing_cache:
+            build_args.extend([
+                f"--load-timing-cache {timing_cache}",
+                f"--save-timing-cache {timing_cache}"
+            ])
+
+        # Verbosity setting
+        verbosity = "extra_verbose" if verbose else "error"
+        build_args.append(f"--verbosity {verbosity}")
+
+        # Output names
+        if update_output_names:
+            print(f"Updating network outputs to {update_output_names}")
+            # build_args.append(f"--trt-outputs {' '.join(update_output_names)}")
+            build_args.append(f"--trt-outputs {update_output_names}")
+
+        # Input profiles
+        if input_profile:
+            profile_args = defaultdict(str)
+            for name, dims in input_profile.items():
+                assert len(dims) == 3
+                profile_args["--trt-min-shapes"] += f"{name}:{str(list(dims[0])).replace(' ', '')} "
+                profile_args["--trt-opt-shapes"] += f"{name}:{str(list(dims[1])).replace(' ', '')} "
+                profile_args["--trt-max-shapes"] += f"{name}:{str(list(dims[2])).replace(' ', '')} "
+
+            build_args.extend(f"{k} {v}" for k, v in profile_args.items())
+
+        # Filter out empty strings and join command
+        build_args = [arg for arg in build_args if arg]
+        final_command = ' '.join(build_command + build_args)
+
+        # Execute command with improved error handling
+        try:
+            print(f"Engine build command: {final_command}")
+            subprocess.run(final_command, check=True, shell=True)
+        except subprocess.CalledProcessError as exc:
+            error_msg = (
+                f"Failed to build TensorRT engine. Error details:\n"
+                f"Command: {exc.cmd}\n"
+            )
+            raise RuntimeError(error_msg) from exc
+
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape, compression_factor=8, min_batch=1, max_batch=8, min_image_shape=256, max_image_shape=1344, min_latent_shape=16, max_latent_shape=1024):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        latent_height = image_height // compression_factor
+        latent_width = image_width // compression_factor
+        min_image_height = image_height if static_shape else min_image_shape
+        max_image_height = image_height if static_shape else max_image_shape
+        min_image_width = image_width if static_shape else min_image_shape
+        max_image_width = image_width if static_shape else max_image_shape
+        min_latent_height = latent_height if static_shape else min_latent_shape
+        max_latent_height = latent_height if static_shape else max_latent_shape
+        min_latent_width = latent_width if static_shape else min_latent_shape
+        max_latent_width = latent_width if static_shape else max_latent_shape
+        return (
+            min_batch,
+            max_batch,
+            min_image_height,
+            max_image_height,
+            min_image_width,
+            max_image_width,
+            min_latent_height,
+            max_latent_height,
+            min_latent_width,
+            max_latent_width,
+        )
````
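To see the shape flags `build()` ends up passing to `polygraphy convert`, you can replay the profile-formatting loop on a small profile. A standalone sketch (the profile values are illustrative; the formatting matches the code above):

```python
from collections import defaultdict

# A [min, opt, max] profile as produced by get_input_profile().
input_profile = {"input_ids": [(1, 77), (1, 77), (8, 77)]}

profile_args = defaultdict(str)
for name, dims in input_profile.items():
    # Same formatting as Engine.build(): "name:[d0,d1]" with spaces stripped.
    profile_args["--trt-min-shapes"] += f"{name}:{str(list(dims[0])).replace(' ', '')} "
    profile_args["--trt-opt-shapes"] += f"{name}:{str(list(dims[1])).replace(' ', '')} "
    profile_args["--trt-max-shapes"] += f"{name}:{str(list(dims[2])).replace(' ', '')} "

print(" ".join(f"{k} {v.strip()}" for k, v in profile_args.items()))
# --trt-min-shapes input_ids:[1,77] --trt-opt-shapes input_ids:[1,77] --trt-max-shapes input_ids:[8,77]
```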

Lines changed: 12 additions & 4 deletions

````diff
@@ -1,18 +1,26 @@
 from .engine import Engine
 
 class T5XXLModel(Engine):
-    def __init__(self, engine_path: str, stream = None):
+    def __init__(self, engine_path: str, stream = None, build = False):
         super().__init__(engine_path, stream)
         self.text_maxlen = 512
         self.d_model = 4096
 
-        # Load engine before
-        self.load_engine()
+        if not build:
+            # Load a prebuilt engine from disk
+            self.load_engine()
 
     def get_shape_dict(self, batch_size, image_height, image_width):
         self.check_dims(batch_size, image_height, image_width)
         output = {
             "input_ids": (batch_size, self.text_maxlen),
             "text_embeddings": (batch_size, self.text_maxlen, self.d_model),
         }
-        return output
+        return output
+
+    def get_input_profile(self, opt_batch_size=1, opt_image_height=1024, opt_image_width=1024, min_batch=1, max_batch=8, min_height=512, max_height=1280, min_width=512, max_width=1280, static_batch=True, dynamic_shape=True):
+        min_batch = opt_batch_size if static_batch else min_batch
+        max_batch = opt_batch_size if static_batch else max_batch
+        return {
+            "input_ids": [(min_batch, self.text_maxlen), (opt_batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+        }
````
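For comparison with CLIP (again, not part of the diff): with `text_maxlen=512` and `d_model=4096`, `get_shape_dict` for a single prompt resolves to the bindings below, independent of image size:

```python
# T5XXLModel.get_shape_dict(batch_size=1, image_height=1024, image_width=1024)
# resolves to these binding shapes, using the class attributes above:
expected = {
    "input_ids": (1, 512),              # token ids, text_maxlen = 512
    "text_embeddings": (1, 512, 4096),  # per-token embeddings, d_model = 4096
}
```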
