Add Flux benchmark

StrongerXi · StrongerXi · commit f0b2a09591de · 2025-11-19T13:25:58.000-08:00
This adds a benchmark for the Flux image generation pipeline. Specifically, it only benchmarks the diffusion transformer (and omits the text encoder and vae, which don't take up much time for the e2e generation in Flux). Needs pytorch/pytorch#168176 to run in pytorch repo: ``` python ./benchmarks/dynamo/torchbench.py --accuracy --inference --backend=inductor --only flux python ./benchmarks/dynamo/torchbench.py --performance --inference --backend=inductor --only flux ```
diff --git a/torchbenchmark/models/flux/__init__.py b/torchbenchmark/models/flux/__init__.py
@@ -0,0 +1,72 @@
+import torch
+from torchbenchmark.tasks import COMPUTER_VISION
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceAuthMixin
+from torchbenchmark.util.model import BenchmarkModel
+
+from .install import load_model_checkpoint
+
+
+class Model(BenchmarkModel, HuggingFaceAuthMixin):
+    task = COMPUTER_VISION.GENERATION
+
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    ALLOW_CUSTOMIZE_BSIZE = False
+    # Skip deepcopy because it will oom on A100 40GB
+    DEEPCOPY = False
+    # Default eval precision on CUDA device is fp16
+    DEFAULT_EVAL_CUDA_PRECISION = "fp16"
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(
+            test=test, device=device, batch_size=batch_size, extra_args=extra_args
+        )
+        self.pipe = load_model_checkpoint()
+        self.example_inputs = {
+            "prompt": "A cat holding a sign that says hello world",
+            "height": 1024,
+            "width": 1024,
+            "guidance_scale": 3.5,
+            "num_inference_steps": 50,
+            "max_sequence_length": 512,
+            "generator": torch.Generator("cpu").manual_seed(0),
+        }
+        self.pipe.to(self.device)
+
+    def enable_fp16(self):
+        # This model uses fp16 by default
+        # Make this function no-op.
+        pass
+
+    def get_module(self):
+        # A common configuration:
+        # - resolution = 1024x1024
+        # - maximum sequence length = 512
+        #
+        # The easiest way to get these metadata is probably to run the pipeline
+        # with the example inputs, and then breakpoint at the transformer module
+        # forward and print out the input tensor metadata.
+        inputs = {
+            "hidden_states": torch.randn(1, 4096, 64, device=self.device, dtype=torch.bfloat16),
+            "encoder_hidden_states": torch.randn(1, 512, 4096, device=self.device, dtype=torch.bfloat16),
+            "pooled_projections": torch.randn(1, 768, device=self.device, dtype=torch.bfloat16),
+            "img_ids": torch.ones(1, 512, 3, device=self.device, dtype=torch.bfloat16),
+            "txt_ids": torch.ones(1, 4096, 3, device=self.device, dtype=torch.bfloat16),
+            "timestep": torch.tensor([1.0], device=self.device, dtype=torch.bfloat16),
+            "guidance": torch.tensor([1.0], device=self.device, dtype=torch.bfloat16),
+        }
+
+        return self.pipe.transformer, inputs
+
+    def set_module(self, mod):
+        self.pipe.transformer = mod
+
+    def train(self):
+        raise NotImplementedError(
+            "Train test is not implemented for the stable diffusion model."
+        )
+
+    def eval(self):
+        image = self.pipe(**self.example_inputs)
+        return (image,)
diff --git a/torchbenchmark/models/flux/install.py b/torchbenchmark/models/flux/install.py
@@ -0,0 +1,27 @@
+import os
+import warnings
+
+import torch
+from torchbenchmark.util.framework.diffusers import install_diffusers
+
+MODEL_NAME = "black-forest-labs/FLUX.1-dev"
+
+
+def load_model_checkpoint():
+    from diffusers import FluxPipeline
+
+    pipe = FluxPipeline.from_pretrained(
+        MODEL_NAME, torch_dtype=torch.bfloat16, safety_checker=None
+    )
+
+    return pipe
+
+
+if __name__ == "__main__":
+    install_diffusers()
+    if not "HUGGING_FACE_HUB_TOKEN" in os.environ:
+        warnings.warn(
+            "Make sure to set `HUGGINGFACE_HUB_TOKEN` so you can download weights"
+        )
+    else:
+        load_model_checkpoint()
diff --git a/torchbenchmark/models/flux/metadata.yaml b/torchbenchmark/models/flux/metadata.yaml
@@ -0,0 +1,10 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 32
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+train_benchmark: false
+train_deterministic: false
+not_implemented:
+- device: cpu