@@ -214,7 +214,7 @@ def use_compile(pipeline):
         pipeline.vae.decode, mode="max-autotune", fullgraph=True
     )

-    # warmup for a few iterations
+    # warmup for a few iterations (`num_inference_steps` shouldn't matter)
    for _ in range(3):
         pipeline(
             "dummy prompt to trigger torch compilation",
@@ -233,21 +233,23 @@ def download_hosted_file(filename, output_path):
     hf_hub_download(REPO_NAME, filename, local_dir=os.path.dirname(output_path))


-def use_export_aoti(pipeline, cache_dir, serialize=False):
+def use_export_aoti(pipeline, cache_dir, serialize=False, is_timestep_distilled=True):
     # create cache dir if needed
     pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

     def _example_tensor(*shape):
         return torch.randn(*shape, device="cuda", dtype=torch.bfloat16)

     # === Transformer compile / export ===
+    seq_length = 256 if is_timestep_distilled else 512
+    # these shapes are for 1024x1024 resolution.
     transformer_kwargs = {
         "hidden_states": _example_tensor(1, 4096, 64),
         "timestep": torch.tensor([1.], device="cuda", dtype=torch.bfloat16),
-        "guidance": None,
+        "guidance": None if is_timestep_distilled else torch.tensor([1.], device="cuda", dtype=torch.bfloat16),
         "pooled_projections": _example_tensor(1, 768),
-        "encoder_hidden_states": _example_tensor(1, 512, 4096),
-        "txt_ids": _example_tensor(512, 3),
+        "encoder_hidden_states": _example_tensor(1, seq_length, 4096),
+        "txt_ids": _example_tensor(seq_length, 3),
         "img_ids": _example_tensor(4096, 3),
         "joint_attention_kwargs": {},
         "return_dict": False,
@@ -291,9 +293,7 @@ def _example_tensor(*shape):
     # hack to get around export's limitations
     pipeline.vae.forward = pipeline.vae.decode

-    vae_decode_kwargs = {
-        "return_dict": False,
-    }
+    vae_decode_kwargs = {"return_dict": False}

     # Possibly serialize model out
     decoder_package_path = os.path.join(cache_dir, "exported_decoder.pt2")
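The serialize/load logic that follows this hunk (unchanged here) exports `pipeline.vae` via the `forward = decode` alias above and packages it with AOTI. A hedged sketch, assuming 1024x1024 generation, where FLUX's 16-channel, 8x-downsampling VAE sees latents of shape (1, 16, 128, 128):

```python
# Sketch only; the latent shape and API usage are assumptions, not the diff's code.
from torch.export import export
from torch._inductor import aoti_compile_and_package, aoti_load_package

if serialize:
    exported_decoder = export(
        pipeline.vae, args=(_example_tensor(1, 16, 128, 128),), kwargs=vae_decode_kwargs
    )
    aoti_compile_and_package(exported_decoder, package_path=decoder_package_path)
pipeline.vae.decode = aoti_load_package(decoder_package_path)
```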
@@ -334,7 +334,7 @@ def _example_tensor(*shape):


 def optimize(pipeline, args):
-    pipeline.set_progress_bar_config(disable=True)
+    is_timestep_distilled = args.ckpt == "black-forest-labs/FLUX.1-schnell"

     # fuse QKV projections in Transformer and VAE
     if not args.disable_fused_projections:
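Keying `is_timestep_distilled` off a hard-coded checkpoint name works for the official FLUX.1-schnell repo but misses mirrors and local copies. An alternative, sketched here as an assumption about diffusers' `FluxTransformer2DModel` config, is to check whether the loaded transformer expects a guidance embedding (schnell, being timestep-distilled, does not):

```python
# Alternative sketch: infer distillation from the model config rather than the
# checkpoint name; `guidance_embeds` is False for schnell, True for dev.
is_timestep_distilled = not pipeline.transformer.config.guidance_embeds
```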
@@ -376,7 +376,9 @@ def optimize(pipeline, args):
         pipeline = use_compile(pipeline)
     elif args.compile_export_mode == "export_aoti":
         # NB: Using a cached export + AOTI model is not supported yet
-        pipeline = use_export_aoti(pipeline, cache_dir=args.cache_dir, serialize=True)
+        pipeline = use_export_aoti(
+            pipeline, cache_dir=args.cache_dir, serialize=True, is_timestep_distilled=is_timestep_distilled
+        )
     elif args.compile_export_mode == "disabled":
         pass
     else:
@@ -390,5 +392,6 @@ def optimize(pipeline, args):
 def load_pipeline(args):
     load_dtype = torch.float32 if args.disable_bf16 else torch.bfloat16
     pipeline = FluxPipeline.from_pretrained(args.ckpt, torch_dtype=load_dtype).to(args.device)
+    pipeline.set_progress_bar_config(disable=True)
     pipeline = optimize(pipeline, args)
     return pipeline