[feature] Support Flux TensorRT Pipeline

vuongminh1907 · vuongminh1907 · commit a93116cbf12d · 2025-08-22T21:28:19.000+07:00
diff --git a/examples/flux-tensorrt/README.md b/examples/flux-tensorrt/README.md
@@ -0,0 +1,121 @@
+# 🚀 Flux TensorRT Pipelines
+
+This project provides **TensorRT-accelerated pipelines** for Flux models, enabling **faster inference** with static and dynamic shapes.
+
+## ✅ Supported Pipelines
+- ✅ `FluxPipeline` (Supported)
+- ⏳ `FluxImg2ImgPipeline` (Coming soon)
+- ⏳ `FluxInpaintPipeline` (Coming soon)
+- ⏳ `FluxFillPipeline` (Coming soon)
+- ⏳ `FluxKontextPipeline` (Coming soon)
+- ⏳ `FluxKontextInpaintPipeline` (Coming soon)
+
+---
+
+## ⚙️ Building Flux with TensorRT
+
+We follow the official [NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT) repository to build the TensorRT engines.
+
+> **Note:**  
+> The TensorRT Diffusion demo was originally built against `diffusers==0.31.1`.  
+> Currently, we recommend using:
+> - one **venv** for building, and  
+> - another **venv** for inference.  
+
+(🔜 TODO: Build scripts for the latest `diffusers` will be added later.)
+
+### Installation
+```bash
+git clone https://github.com/NVIDIA/TensorRT
+cd TensorRT/demo/Diffusion
+
+pip install tensorrt-cu12==10.13.2.6
+pip install -r requirements.txt
+```
+
+### ⚡ Fast Building with Static Shapes
+```bash
+# BF16
+python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --bf16 --download-onnx-models
+
+# FP8
+python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --quantization-level 4 --fp8 --download-onnx-models
+
+# FP4
+python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --fp4 --download-onnx-models
+```
+
+- To build with dynamic shapes, add `--build-dynamic-shape`.
+- To build with a static batch size, add `--build-static-batch`.
+
+ℹ️ For more details, run:
+`python3 demo_txt2img_flux.py --help`
+
+## 🖼️ Inference with Flux TensorRT
+Create a new venv (or upgrade `diffusers` and `peft` in your existing one), then run fast inference with the TensorRT engines built above.
+
+Example: Full Pipeline with All Engines
+
+```python
+from pipeline_flux_trt import FluxPipelineTRT
+from cuda import cudart
+import torch
+
+from module.transformers import FluxTransformerModel 
+from module.vae import VAEModel
+from module.t5xxl import T5XXLModel
+from module.clip import CLIPModel
+import time
+
+# Local path for each engine
+engine_transformer_path = "path/to/transformer/engine_trt10.13.2.6.plan"
+engine_vae_path = "path/to/vae/engine_trt10.13.2.6.plan"
+engine_t5xxl_path = "path/to/t5/engine_trt10.13.2.6.plan"
+engine_clip_path = "path/to/clip/engine_trt10.13.2.6.plan"
+
+# Create one CUDA stream shared by all engines
+stream = cudart.cudaStreamCreate()[1]
+
+# Create engine for each model
+engine_transformer = FluxTransformerModel(engine_transformer_path, stream)
+engine_vae = VAEModel(engine_vae_path, stream)
+engine_t5xxl = T5XXLModel(engine_t5xxl_path, stream)
+engine_clip = CLIPModel(engine_clip_path, stream)
+
+# Create pipeline
+pipeline = FluxPipelineTRT.from_pretrained(
+            "black-forest-labs/FLUX.1-dev", 
+            torch_dtype=torch.bfloat16, 
+            engine_transformer=engine_transformer,
+            engine_vae=engine_vae,
+            engine_text_encoder=engine_clip,
+            engine_text_encoder_2= engine_t5xxl,
+            )
+pipeline.to("cuda")
+
+
+prompt = "A cat holding a sign that says hello world"
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipeline(prompt, num_inference_steps=28, guidance_scale=3.0, generator=generator).images[0]
+
+
+image.save("test_pipeline.png")
+```
+
+Example: Transformer Only (Other Modules on Torch)
+```python
+pipeline = FluxPipelineTRT.from_pretrained(
+            "black-forest-labs/FLUX.1-dev", 
+            torch_dtype=torch.bfloat16, 
+            engine_transformer=engine_transformer,
+            )
+pipeline.to("cuda")
+```
+
+## 📌 Notes
+
+- Ensure correct CUDA / TensorRT versions are installed.
+
+- Always load `.plan` engine files with the same TensorRT version that was used to build them.
+
+- For best performance, prefer static shapes unless dynamic batching is required.
diff --git a/examples/flux-tensorrt/module/clip.py b/examples/flux-tensorrt/module/clip.py
@@ -0,0 +1,21 @@
+from .engine import Engine
+
+class CLIPModel(Engine):
+    def __init__(self, engine_path: str, stream = None):
+        super().__init__(engine_path, stream)
+        self.text_maxlen = 77
+        self.embedding_dim = 768
+        self.keep_pooled_output = True
+
+        # Load engine before
+        self.load_engine()
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        output = {
+            "input_ids": (batch_size, self.text_maxlen),
+            "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim),
+        }
+        if self.keep_pooled_output:
+            output["pooled_embeddings"] = (batch_size, self.embedding_dim)
+        return output
diff --git a/examples/flux-tensorrt/module/engine.py b/examples/flux-tensorrt/module/engine.py
@@ -0,0 +1,83 @@
+import gc
+from cuda import cudart
+import torch
+import tensorrt as trt
+
+from collections import OrderedDict
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.trt import engine_from_bytes
+
+trt_to_torch_dtype_dict = {
+    trt.DataType.BOOL: torch.bool,
+    trt.DataType.UINT8: torch.uint8,
+    trt.DataType.INT8: torch.int8,
+    trt.DataType.INT32: torch.int32,
+    trt.DataType.INT64: torch.int64,
+    trt.DataType.HALF: torch.float16,
+    trt.DataType.FLOAT: torch.float32,
+    trt.DataType.BF16: torch.bfloat16,
+}
+
+class Engine:
+    def __init__(self, engine_path: str, stream = None):
+        self.engine_path = engine_path
+        self._binding_indices = {}
+        self.stream = stream
+        self.tensors = OrderedDict()
+
+    def load_engine(self,stream = None):
+        self.engine_bytes_cpu = bytes_from_path(self.engine_path)
+        self.engine = engine_from_bytes(self.engine_bytes_cpu)
+        self.context = self.engine.create_execution_context()
+
+        if stream is None:
+            self.stream = cudart.cudaStreamCreate()[1]
+
+    def allocate_buffers(self, shape_dict=None, device="cuda"):
+        for binding in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(binding)
+            if shape_dict and name in shape_dict:
+                shape = shape_dict[name]
+            else:
+                shape = self.engine.get_tensor_shape(name)
+                print(
+                    f"[W]: {self.engine_path}: Could not find '{name}' in shape dict {shape_dict}.  Using shape {shape} inferred from the engine."
+                )
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                self.context.set_input_shape(name, shape)
+            dtype = trt_to_torch_dtype_dict[self.engine.get_tensor_dtype(name)]
+            tensor = torch.empty(tuple(shape), dtype=dtype).to(device=device)
+            self.tensors[name] = tensor
+
+    def infer(self, feed_dict, stream):
+        for name, buf in feed_dict.items():
+            self.tensors[name].copy_(buf)
+
+        for name, tensor in self.tensors.items():
+            self.context.set_tensor_address(name, tensor.data_ptr())
+
+        noerror = self.context.execute_async_v3(stream)
+        if not noerror:
+            raise ValueError(f"ERROR: inference of {self.engine_path} failed.")
+
+        return self.tensors
+
+    def unload_engine(self):
+        del self.engine
+        self.engine = None
+        gc.collect()
+
+    def get_shape_dict(self):
+        pass
+
+    def check_dims(self, batch_size, image_height, image_width, compression_factor = 8, min_batch = 1, max_batch = 16, min_latent_shape = 16, max_latent_shape = 1024):
+        assert batch_size >= min_batch and batch_size <= max_batch
+        latent_height = image_height // compression_factor
+        latent_width = image_width // compression_factor
+        assert latent_height >= min_latent_shape and latent_height <= max_latent_shape
+        assert latent_width >= min_latent_shape and latent_width <= max_latent_shape
+        return (latent_height, latent_width)
+
+
+    
+
diff --git a/examples/flux-tensorrt/module/t5xxl.py b/examples/flux-tensorrt/module/t5xxl.py
@@ -0,0 +1,18 @@
+from .engine import Engine
+
+class T5XXLModel(Engine):
+    def __init__(self, engine_path: str, stream = None):
+        super().__init__(engine_path, stream)
+        self.text_maxlen = 512
+        self.d_model = 4096
+
+        # Load engine before
+        self.load_engine()
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        output = {
+            "input_ids": (batch_size, self.text_maxlen),
+            "text_embeddings": (batch_size, self.text_maxlen, self.d_model),
+        }
+        return output
diff --git a/examples/flux-tensorrt/module/transformers.py b/examples/flux-tensorrt/module/transformers.py
@@ -0,0 +1,24 @@
+from .engine import Engine
+
+class FluxTransformerModel(Engine):
+    def __init__(self, engine_path: str, stream = None):
+        super().__init__(engine_path, stream)
+        self.in_channels = 64
+
+        # Load engine before
+        self.load_engine()
+
+    def get_shape_dict(self,batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        shape_dict = {
+            "hidden_states": (batch_size, (latent_height // 2) * (latent_width // 2), 64),
+            "encoder_hidden_states": (batch_size, 512, 4096),
+            "pooled_projections": (batch_size, 768),
+            "timestep": (batch_size,),
+            "img_ids": ((latent_height // 2) * (latent_width // 2), 3),
+            "txt_ids": (512, 3),
+            "latent": (batch_size, (latent_height // 2) * (latent_width // 2), 64),
+            "guidance": (batch_size,),
+        }
+
+        return shape_dict
diff --git a/examples/flux-tensorrt/module/vae.py b/examples/flux-tensorrt/module/vae.py
@@ -0,0 +1,18 @@
+from .engine import Engine
+
+class VAEModel(Engine):
+    def __init__(self, engine_path: str, stream = None):
+        super().__init__(engine_path, stream)
+        self.latent_channels = 16
+        self.scaling_factor = 0.3611
+        self.shift_factor = 0.1159
+
+        # Load engine before
+        self.load_engine()
+
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            "latent": (batch_size, self.latent_channels, latent_height, latent_width),
+            "images": (batch_size, 3, image_height, image_width),
+        }
diff --git a/examples/flux-tensorrt/pipeline_flux_trt.py b/examples/flux-tensorrt/pipeline_flux_trt.py