
Commit b1fc6e1
add kontext support.
1 parent 696936c

4 files changed: +126 -18 lines

experiments_kontext.sh

Lines changed: 96 additions & 0 deletions
#!/bin/bash

CKPT="black-forest-labs/FLUX.1-Kontext-dev"
IMAGE="yarn-art-pikachu.png"
PROMPT="Make Pikachu hold a sign that says 'Black Forest Labs is awesome', yarn art style, detailed, vibrant colors"
CACHE_DIR="/fsx/sayak/.cache"

# bfloat16
python run_benchmark.py \
    --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --compile_export_mode disabled \
    --disable_fused_projections \
    --disable_channels_last \
    --disable_fa3 \
    --disable_quant \
    --disable_inductor_tuning_flags \
    --output-file bf16.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > bf16.txt 2>&1

# bfloat16 + torch.compile
python run_benchmark.py \
    --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --compile_export_mode compile \
    --disable_fused_projections \
    --disable_channels_last \
    --disable_fa3 \
    --disable_quant \
    --disable_inductor_tuning_flags \
    --output-file bf16_compile.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > bf16_compile.txt 2>&1

# bfloat16 + torch.compile + qkv projection
python run_benchmark.py \
    --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --compile_export_mode compile \
    --disable_channels_last \
    --disable_fa3 \
    --disable_quant \
    --disable_inductor_tuning_flags \
    --output-file bf16_compile_qkv.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > bf16_compile_qkv.txt 2>&1

# bfloat16 + torch.compile + qkv projection + channels_last
python run_benchmark.py \
    --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --compile_export_mode compile \
    --disable_fa3 \
    --disable_quant \
    --disable_inductor_tuning_flags \
    --output-file bf16_compile_qkv_chan.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > bf16_compile_qkv_chan.txt 2>&1

# bfloat16 + torch.compile + qkv projection + channels_last + FA3
python run_benchmark.py \
    --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --compile_export_mode compile \
    --disable_quant \
    --disable_inductor_tuning_flags \
    --output-file bf16_compile_qkv_chan_fa3.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > bf16_compile_qkv_chan_fa3.txt 2>&1

# bfloat16 + torch.compile + qkv projection + channels_last + FA3 + float8 quant
python run_benchmark.py \
    --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --compile_export_mode compile \
    --disable_inductor_tuning_flags \
    --output-file bf16_compile_qkv_chan_fa3_quant.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > bf16_compile_qkv_chan_fa3_quant.txt 2>&1

# bfloat16 + torch.compile + qkv projection + channels_last + FA3 + float8 quant + inductor flags
python run_benchmark.py \
    --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --compile_export_mode compile \
    --output-file bf16_compile_qkv_chan_fa3_quant_flags.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > bf16_compile_qkv_chan_fa3_quant_flags.txt 2>&1

# fully optimized (torch.export + AOTI to address cold start)
python run_benchmark.py --ckpt $CKPT --image $IMAGE --prompt "$PROMPT" \
    --output-file fully_optimized.png \
    --num_inference_steps 28 \
    --cache-dir $CACHE_DIR \
    > fully_optimized.txt 2>&1
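For reference, the bf16 baseline run above boils down to a plain diffusers call along the lines of the sketch below. This is not part of the commit; the file names mirror the script, and the guidance_scale of 2.5 matches the Kontext default added in run_benchmark.py.

import torch
from diffusers import FluxKontextPipeline
from diffusers.utils import load_image

# Load the editing pipeline in bfloat16 on CUDA (assumes a GPU with enough memory).
pipe = FluxKontextPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
).to("cuda")

out = pipe(
    prompt=(
        "Make Pikachu hold a sign that says 'Black Forest Labs is awesome', "
        "yarn art style, detailed, vibrant colors"
    ),
    image=load_image("yarn-art-pikachu.png"),  # conditioning image, as in the script
    guidance_scale=2.5,
    num_inference_steps=28,
    generator=torch.manual_seed(0),
).images[0]
out.save("bf16.png")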

run_benchmark.py

Lines changed: 4 additions & 2 deletions
@@ -10,6 +10,8 @@ def _determine_pipe_call_kwargs(args):
     ckpt_id = args.ckpt
     if ckpt_id == "black-forest-labs/FLUX.1-dev":
         kwargs = {"max_sequence_length": 512, "guidance_scale": 3.5}
+    elif ckpt_id == "black-forest-labs/FLUX.1-Kontext-dev":
+        kwargs = {"max_sequence_length": 512, "guidance_scale": 2.5}
     return kwargs

 def set_rand_seeds(seed):
@@ -27,7 +29,7 @@ def main(args):
     image = pipeline(
         args.prompt,
         num_inference_steps=args.num_inference_steps,
-        generator=torch.manual_seed(0),
+        generator=torch.manual_seed(args.seed),
         **_determine_pipe_call_kwargs(args)
     ).images[0]

@@ -38,7 +40,7 @@
     image = pipeline(
         args.prompt,
         num_inference_steps=args.num_inference_steps,
-        generator=torch.manual_seed(0),
+        generator=torch.manual_seed(args.seed),
         **_determine_pipe_call_kwargs(args)
     ).images[0]
     end = time.time()
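The generator change makes both the warmup and the timed call honor --seed instead of a hard-coded 0. As a standalone sanity check (not part of the patch), re-seeding the default generator reproduces identical noise, which is what keeps outputs comparable across benchmark configurations:

import torch

g = torch.manual_seed(42)            # returns the re-seeded default torch.Generator
a = torch.randn(4, generator=g)
g = torch.manual_seed(42)
b = torch.randn(4, generator=g)
assert torch.equal(a, b)             # same seed -> same latents -> same image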

utils/benchmark_utils.py

Lines changed: 3 additions & 0 deletions
@@ -10,9 +10,12 @@ def create_parser():

     # general options
     parser.add_argument("--ckpt", type=str, default="black-forest-labs/FLUX.1-schnell",
+                        choices=["black-forest-labs/FLUX.1-schnell", "black-forest-labs/FLUX.1-dev",
+                                 "black-forest-labs/FLUX.1-Kontext-dev"],
                         help="Model checkpoint path")
     parser.add_argument("--prompt", type=str, default="A cat playing with a ball of yarn",
                         help="Text prompt")
+    parser.add_argument("--image", type=str, default=None, help="Image to use for Kontext")
     parser.add_argument("--cache-dir", type=str, default=os.path.expandvars("$HOME/.cache/flux-fast"),
                         help="Cache directory for storing exported models")
     parser.add_argument("--use-cached-model", action="store_true",

utils/pipeline_utils.py

Lines changed: 23 additions & 16 deletions
@@ -1,10 +1,10 @@
 import os
 import pathlib
 import torch
-import torch.nn.functional as F
-from diffusers import FluxPipeline
+from diffusers import FluxPipeline, FluxKontextPipeline
 from torch._inductor.package import load_package as inductor_load_package
-from typing import List, Optional, Tuple
+from typing import List, Optional
+from PIL import Image
 import inspect

@@ -213,6 +213,7 @@ def wrapped(*args, **kwargs):

 def use_compile(pipeline):
     # Compile the compute-intensive portions of the model: denoising transformer / decoder
+    is_kontext = "Kontext" in pipeline.__class__.__name__
     pipeline.transformer = torch.compile(
         pipeline.transformer, mode="max-autotune", fullgraph=True
     )
@@ -221,12 +222,13 @@
     )

     # warmup for a few iterations (`num_inference_steps` shouldn't matter)
+    input_kwargs = {
+        "prompt": "dummy prompt to trigger torch compilation", "num_inference_steps": 4
+    }
+    if is_kontext:
+        input_kwargs.update({"image": Image.new("RGB", size=(1024, 1024))})
     for _ in range(3):
-        pipeline(
-            "dummy prompt to trigger torch compilation",
-            output_type="pil",
-            num_inference_steps=4,
-        ).images[0]
+        pipeline(**input_kwargs).images[0]

     return pipeline
@@ -254,24 +256,28 @@ def use_export_aoti(pipeline, cache_dir, serialize=False, is_timestep_distilled=
     def _example_tensor(*shape):
         return torch.randn(*shape, device="cuda", dtype=torch.bfloat16)

+    # helpful flag
+    is_kontext = "Kontext" in pipeline.__class__.__name__
+
     # === Transformer compile / export ===
     seq_length = 256 if is_timestep_distilled else 512
     # these shapes are for 1024x1024 resolution.
     transformer_kwargs = {
-        "hidden_states": _example_tensor(1, 4096, 64),
+        "hidden_states": _example_tensor(1, 4096 * 2, 64) if is_kontext else _example_tensor(1, 4096, 64),
         "timestep": torch.tensor([1.], device="cuda", dtype=torch.bfloat16),
         "guidance": None if is_timestep_distilled else torch.tensor([1.], device="cuda", dtype=torch.bfloat16),
         "pooled_projections": _example_tensor(1, 768),
         "encoder_hidden_states": _example_tensor(1, seq_length, 4096),
         "txt_ids": _example_tensor(seq_length, 3),
-        "img_ids": _example_tensor(4096, 3),
+        "img_ids": _example_tensor(4096 * 2, 3) if is_kontext else _example_tensor(4096, 3),
         "joint_attention_kwargs": {},
         "return_dict": False,
     }

     # Possibly serialize model out
+    dev_transformer_name = "exported_kontext_dev_transformer.pt2" if is_kontext else "exported_dev_transformer.pt2"
     transformer_package_path = os.path.join(
-        cache_dir, "exported_transformer.pt2" if is_timestep_distilled else "exported_dev_transformer.pt2"
+        cache_dir, "exported_transformer.pt2" if is_timestep_distilled else dev_transformer_name
     )
     if serialize:
         # Apply export
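The doubled example shapes come from Kontext's conditioning scheme: the input image's latent tokens are concatenated with the generated image's tokens along the sequence dimension, so the transformer sees twice as many image tokens. A back-of-the-envelope check, assuming FLUX's 8x VAE downsampling and 2x2 patch packing at 1024x1024:

vae_downsample, patch_size = 8, 2
latent_side = 1024 // vae_downsample // patch_size   # 64
tokens_per_image = latent_side ** 2                  # 4096, the non-Kontext shape
kontext_tokens = 2 * tokens_per_image                # 8192 = 4096 * 2, as exported above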
@@ -333,12 +339,13 @@ def _example_tensor(*shape):
     pipeline.vae.decode = loaded_decoder

     # warmup for a few iterations
+    input_kwargs = {
+        "prompt": "dummy prompt to trigger torch compilation", "num_inference_steps": 4
+    }
+    if is_kontext:
+        input_kwargs.update({"image": Image.new("RGB", size=(1024, 1024))})
     for _ in range(3):
-        pipeline(
-            "dummy prompt to trigger torch compilation",
-            output_type="pil",
-            num_inference_steps=4,
-        ).images[0]
+        pipeline(**input_kwargs).images[0]

     return pipeline
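Both warmup loops now build a single input_kwargs dict because the Kontext pipeline needs a conditioning image. Only the traced shapes matter for compilation, not the pixel content, so a blank canvas at the assumed 1024x1024 resolution is enough:

from PIL import Image

# Matches the 1024x1024 geometry the exported example tensors assume.
dummy = Image.new("RGB", size=(1024, 1024))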
