Commit 31e34d5

fixes.
1 parent ad18983

File tree

2 files changed: 140 additions & 54 deletions

benchmarks/benchmarking_flux.py

Lines changed: 92 additions & 38 deletions
@@ -1,43 +1,97 @@
+from functools import partial
+
 import torch
-from benchmarking_utils import BenchmarkMixin
+from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn

-from diffusers import FluxTransformer2DModel
+from diffusers import BitsAndBytesConfig, FluxTransformer2DModel
 from diffusers.utils.testing_utils import torch_device


-class BenchmarkFlux(BenchmarkMixin):
-    model_class = FluxTransformer2DModel
-    compile_kwargs = {"fullgraph": True}
-
-    def get_model_init_dict(self):
-        return {
-            "pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
-            "subfolder": "transformer",
-            "torch_dtype": torch.bfloat16,
-        }
-
-    def initialize_model(self):
-        model = self.model_class.from_pretrained(**self.get_model_init_dict())
-        model = model.to(torch_device).eval()
-        return model
-
-    def get_input_dict(self):
-        # resolution: 1024x1024
-        # maximum sequence length 512
-        hidden_states = torch.randn(1, 4096, 64, device=torch_device, dtype=torch.bfloat16)
-        encoder_hidden_states = torch.randn(1, 512, 4096, device=torch_device, dtype=torch.bfloat16)
-        pooled_prompt_embeds = torch.randn(1, 768, device=torch_device, dtype=torch.bfloat16)
-        image_ids = torch.ones(512, 3, device=torch_device, dtype=torch.bfloat16)
-        text_ids = torch.ones(4096, 3, device=torch_device, dtype=torch.bfloat16)
-        timestep = torch.tensor([1.0], device=torch_device, dtype=torch.bfloat16)
-        guidance = torch.tensor([1.0], device=torch_device, dtype=torch.bfloat16)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "pooled_projections": pooled_prompt_embeds,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
+CKPT_ID = "black-forest-labs/FLUX.1-dev"
+
+
+def get_input_dict(**device_dtype_kwargs):
+    # resolution: 1024x1024
+    # maximum sequence length 512
+    hidden_states = torch.randn(1, 4096, 64, **device_dtype_kwargs)
+    encoder_hidden_states = torch.randn(1, 512, 4096, **device_dtype_kwargs)
+    pooled_prompt_embeds = torch.randn(1, 768, **device_dtype_kwargs)
+    image_ids = torch.ones(4096, 3, **device_dtype_kwargs)  # one id triple per image token (64x64 packed latents)
+    text_ids = torch.ones(512, 3, **device_dtype_kwargs)  # one id triple per text token
+    timestep = torch.tensor([1.0], **device_dtype_kwargs)
+    guidance = torch.tensor([1.0], **device_dtype_kwargs)
+
+    return {
+        "hidden_states": hidden_states,
+        "encoder_hidden_states": encoder_hidden_states,
+        "img_ids": image_ids,
+        "txt_ids": text_ids,
+        "pooled_projections": pooled_prompt_embeds,
+        "timestep": timestep,
+        "guidance": guidance,
+    }
+
+
+if __name__ == "__main__":
+    scenarios = [
+        BenchmarkScenario(
+            name=f"{CKPT_ID}-bf16",
+            model_cls=FluxTransformer2DModel,
+            model_init_kwargs={
+                "pretrained_model_name_or_path": CKPT_ID,
+                "torch_dtype": torch.bfloat16,
+                "subfolder": "transformer",
+            },
+            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
+            model_init_fn=model_init_fn,
+            compile_kwargs={"fullgraph": True},
+        ),
+        BenchmarkScenario(
+            name=f"{CKPT_ID}-bnb-nf4",
+            model_cls=FluxTransformer2DModel,
+            model_init_kwargs={
+                "pretrained_model_name_or_path": CKPT_ID,
+                "torch_dtype": torch.bfloat16,
+                "subfolder": "transformer",
+                "quantization_config": BitsAndBytesConfig(
+                    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4"
+                ),
+            },
+            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
+            model_init_fn=model_init_fn,
+        ),
+        BenchmarkScenario(
+            name=f"{CKPT_ID}-layerwise-upcasting",
+            model_cls=FluxTransformer2DModel,
+            model_init_kwargs={
+                "pretrained_model_name_or_path": CKPT_ID,
+                "torch_dtype": torch.bfloat16,
+                "subfolder": "transformer",
+            },
+            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
+            model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
+        ),
+        BenchmarkScenario(
+            name=f"{CKPT_ID}-group-offload-leaf",
+            model_cls=FluxTransformer2DModel,
+            model_init_kwargs={
+                "pretrained_model_name_or_path": CKPT_ID,
+                "torch_dtype": torch.bfloat16,
+                "subfolder": "transformer",
+            },
+            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
+            model_init_fn=partial(
+                model_init_fn,
+                group_offload_kwargs={
+                    "onload_device": torch_device,
+                    "offload_device": torch.device("cpu"),
+                    "offload_type": "leaf_level",
+                    "use_stream": True,
+                    "non_blocking": True,
+                },
+            ),
+        ),
+    ]
+
+    runner = BenchmarkMixin()
+    runner.run_benchmarks_and_collate(scenarios, filename="flux.csv")
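Since the scenario list is plain data, extending the sweep is just a matter of appending another BenchmarkScenario before the runner call. A minimal sketch against the same API (the scenario name and the "max-autotune" compile mode are illustrative additions, not part of this commit):

scenarios.append(
    BenchmarkScenario(
        name=f"{CKPT_ID}-bf16-max-autotune",  # hypothetical scenario name
        model_cls=FluxTransformer2DModel,
        model_init_kwargs={
            "pretrained_model_name_or_path": CKPT_ID,
            "torch_dtype": torch.bfloat16,
            "subfolder": "transformer",
        },
        get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
        model_init_fn=model_init_fn,
        compile_kwargs={"fullgraph": True, "mode": "max-autotune"},  # both keys are surfaced in the CSV
    )
)

Each scenario becomes one row of the output CSV, with the columns run_benchmark assembles: scenario, model_cls, time_plain_s, mem_plain_GB, time_compile_s, mem_compile_GB, fullgraph, and mode.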

benchmarks/benchmarking_utils.py

Lines changed: 48 additions & 16 deletions
@@ -1,12 +1,14 @@
 import gc
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Union

+import pandas as pd
 import torch
 import torch.utils.benchmark as benchmark

 from diffusers.models.modeling_utils import ModelMixin
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import require_torch_gpu, torch_device


 def benchmark_fn(f, *args, **kwargs):
@@ -25,13 +27,26 @@ def flush():
     torch.cuda.reset_peak_memory_stats()


+def model_init_fn(model_cls, group_offload_kwargs=None, layerwise_upcasting=False, **init_kwargs):
+    model = model_cls.from_pretrained(**init_kwargs).eval()
+    if group_offload_kwargs and isinstance(group_offload_kwargs, dict):
+        model.enable_group_offload(**group_offload_kwargs)
+    else:
+        model.to(torch_device)
+    if layerwise_upcasting:
+        model.enable_layerwise_casting(
+            storage_dtype=torch.float8_e4m3fn, compute_dtype=init_kwargs.get("torch_dtype", torch.bfloat16)
+        )
+    return model
+
+
 @dataclass
 class BenchmarkScenario:
     name: str
     model_cls: ModelMixin
     model_init_kwargs: Dict[str, Any]
     model_init_fn: Callable
-    get_model_input_dict: Callable[[], Dict[str, Any]]
+    get_model_input_dict: Callable
     compile_kwargs: Optional[Dict[str, Any]] = None
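The new model_init_fn is the single loader every scenario plugs into: it loads the checkpoint, then either installs group offloading or moves the model to the accelerator, and can optionally enable layerwise casting (float8_e4m3fn storage with the checkpoint's compute dtype). A quick usage sketch, mirroring the layerwise-upcasting scenario in benchmarking_flux.py:

model = model_init_fn(
    FluxTransformer2DModel,
    layerwise_upcasting=True,  # float8_e4m3fn storage, bf16 compute
    pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)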

@@ -50,54 +65,71 @@ def post_benchmark(self, model):
     def run_benchmark(self, scenario: BenchmarkScenario):
         # 1) plain stats
         plain = self._run_phase(
+            model_cls=scenario.model_cls,
             init_fn=scenario.model_init_fn,
             init_kwargs=scenario.model_init_kwargs,
             get_input_fn=scenario.get_model_input_dict,
             compile_kwargs=None,
         )

         # 2) compiled stats (if any)
-        compiled = None
+        compiled = {"time": None, "memory": None}
         if scenario.compile_kwargs:
             compiled = self._run_phase(
+                model_cls=scenario.model_cls,
                 init_fn=scenario.model_init_fn,
                 init_kwargs=scenario.model_init_kwargs,
                 get_input_fn=scenario.get_model_input_dict,
                 compile_kwargs=scenario.compile_kwargs,
             )

         # 3) merge
-        result = {"scenario": scenario.name, "time_plain_s": plain["time"], "mem_plain_GB": plain["memory"]}
-        if compiled:
-            result.update(
-                {
-                    "time_compile_s": compiled["time"],
-                    "mem_compile_GB": compiled["memory"],
-                }
-            )
+        result = {
+            "scenario": scenario.name,
+            "model_cls": scenario.model_cls.__name__,
+            "time_plain_s": plain["time"],
+            "mem_plain_GB": plain["memory"],
+            "time_compile_s": compiled["time"],
+            "mem_compile_GB": compiled["memory"],
+        }
+        if scenario.compile_kwargs:
+            result["fullgraph"] = scenario.compile_kwargs.get("fullgraph", False)
+            result["mode"] = scenario.compile_kwargs.get("mode", "default")
+        else:
+            result["fullgraph"], result["mode"] = None, None
         return result

+    def run_benchmarks_and_collate(self, scenarios: Union[BenchmarkScenario, list[BenchmarkScenario]], filename: str):
+        if not isinstance(scenarios, list):
+            scenarios = [scenarios]
+        records = [self.run_benchmark(s) for s in scenarios]
+        df = pd.DataFrame.from_records(records)
+        df.to_csv(filename, index=False)
+
     def _run_phase(
         self,
         *,
-        init_fn: Callable[..., Any],
+        model_cls: ModelMixin,
+        init_fn: Callable,
         init_kwargs: Dict[str, Any],
-        get_input_fn: Callable[[], Dict[str, torch.Tensor]],
+        get_input_fn: Callable,
         compile_kwargs: Optional[Dict[str, Any]],
     ) -> Dict[str, float]:
         # setup
         self.pre_benchmark()

         # init & (optional) compile
-        model = init_fn(**init_kwargs)
+        model = init_fn(model_cls, **init_kwargs)
         if compile_kwargs:
             model.compile(**compile_kwargs)

         # build inputs
         inp = get_input_fn()

         # measure
-        time_s = benchmark_fn(lambda m, d: m(**d), model, inp)
+        run_ctx = torch._inductor.utils.fresh_inductor_cache() if compile_kwargs else nullcontext()
+        with run_ctx:
+            time_s = benchmark_fn(lambda m, d: m(**d), model, inp)
         mem_gb = torch.cuda.max_memory_allocated() / (1024**3)
         mem_gb = round(mem_gb, 2)
