
Commit 0a3b7a1

readme and standalone script.
1 parent 98c6c52 commit 0a3b7a1

3 files changed (+103 -12 lines)


README.md

Lines changed: 32 additions & 0 deletions
@@ -37,6 +37,38 @@ To install flash attention v3, follow the instructions in https://github.com/Dao
For hardware, we used a 96GB 700W H100 GPU. Some of the optimizations applied (BFloat16, torch.compile, Combining q,k,v projections, dynamic float8 quantization) are available on CPU as well.
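For reference, a minimal sketch of applying that CPU-compatible subset with stock diffusers/PyTorch APIs (this is not the repo's optimized path; the `fuse_qkv_projections` calls and compile settings are assumptions based on recent diffusers/PyTorch versions):

```python
import torch
from diffusers import FluxPipeline

# Load the weights in BFloat16; this works on CPU as well as GPU.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
)

# Combine the q, k, v projections in the transformer and VAE
# (diffusers API, assumed available in recent versions).
pipe.transformer.fuse_qkv_projections()
pipe.vae.fuse_qkv_projections()

# torch.compile also targets CPU (via the C++/OpenMP inductor backend).
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
```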

## Run the optimized pipeline

```sh
python optimized_flux_inference.py
```

This will run Flux Schnell and use the AOT-serialized binaries. If the binaries don't exist locally, they will be downloaded automatically from [here](https://hf.co/jbschlosser/flux-fast).

Usage:

```
usage: optimized_flux_inference.py [-h] [--cache_dir CACHE_DIR] [--ckpt CKPT] [--prompt PROMPT]
                                   [--num_inference_steps NUM_INFERENCE_STEPS] [--guidance_scale GUIDANCE_SCALE] [--seed SEED]
                                   [--output_file OUTPUT_FILE]

options:
  -h, --help            show this help message and exit
  --cache_dir CACHE_DIR
                        Directory where we should expect to find the AOT exported artifacts as well as the model params.
  --ckpt CKPT
  --prompt PROMPT
  --num_inference_steps NUM_INFERENCE_STEPS
  --guidance_scale GUIDANCE_SCALE
                        Ignored when using Schnell.
  --seed SEED
  --output_file OUTPUT_FILE
                        Output image file path
```

> [!IMPORTANT]
> The binaries won't work on hardware different from the hardware they were obtained on. For example, binaries obtained on an H100 won't work on an A100.

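If your hardware differs, the artifacts can be regenerated locally instead of downloaded. A rough sketch built on the `use_export_aoti` helper whose signature appears in `utils/pipeline_utils.py` later in this diff; the behavior of `serialize=True` described in the comment is an assumption, not documented here:

```python
import os
import torch
from diffusers import FluxPipeline
from utils.pipeline_utils import use_export_aoti

# Same default cache location as optimized_flux_inference.py.
cache_dir = os.path.expandvars("$HOME/.cache/flux-fast")

pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
).to("cuda")

# Assumption: serialize=True exports the transformer/decoder with AOTInductor on
# the current GPU and writes the packages into cache_dir for later runs,
# instead of downloading pre-built ones.
use_export_aoti(pipeline, cache_dir, serialize=True, is_timestep_distilled=True)
```
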
## Benchmarking
[`run_benchmark.py`](./run_benchmark.py) is the main script for benchmarking the different optimization techniques.
Usage:

optimized_flux_inference.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import argparse
from diffusers import FluxPipeline
import torch
import os
from utils.pipeline_utils import load_package


@torch.no_grad()
def load_pipeline(args):
    pipeline = FluxPipeline.from_pretrained(args.ckpt, torch_dtype=torch.bfloat16, cache_dir=args.cache_dir).to("cuda")

    is_timestep_distilled = not pipeline.transformer.config.guidance_embeds

    transformer_package_path = os.path.join(
        args.cache_dir, "exported_transformer.pt2" if is_timestep_distilled else "exported_dev_transformer.pt"
    )
    decoder_package_path = os.path.join(
        args.cache_dir, "exported_decoder.pt2" if is_timestep_distilled else "exported_dev_decoder.pt"
    )
    # Swap in the AOT-compiled transformer forward and VAE decode.
    loaded_transformer = load_package(transformer_package_path)
    loaded_decoder = load_package(decoder_package_path)
    pipeline.transformer.forward = loaded_transformer
    pipeline.vae.decode = loaded_decoder

    return pipeline


def create_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cache_dir",
        type=str,
        default=os.path.expandvars("$HOME/.cache/flux-fast"),
        help="Directory where we should expect to find the AOT exported artifacts as well as the model params.",
    )
    parser.add_argument("--ckpt", type=str, default="black-forest-labs/FLUX.1-schnell")
    parser.add_argument("--prompt", type=str, default="A cat playing with a ball of yarn")
    parser.add_argument("--num_inference_steps", type=int, default=4)
    parser.add_argument("--guidance_scale", type=float, default=3.5, help="Ignored when using Schnell.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--output_file", type=str, default="output.png", help="Output image file path")
    return parser


if __name__ == "__main__":
    parser = create_arg_parser()
    args = parser.parse_args()
    pipeline = load_pipeline(args)

    is_timestep_distilled = not pipeline.transformer.config.guidance_embeds
    image = pipeline(
        prompt=args.prompt,
        num_inference_steps=args.num_inference_steps,
        max_sequence_length=256 if is_timestep_distilled else 512,
        guidance_scale=None if is_timestep_distilled else args.guidance_scale,
        generator=torch.manual_seed(args.seed),
    ).images[0]
    image.save(args.output_file)
    print(f"Image serialized to {args.output_file}")
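The script can also be driven programmatically by reusing its helpers; a small sketch (the prompt and output filename here are made up):

```python
import torch
from optimized_flux_inference import create_arg_parser, load_pipeline

# Reuse the script's own parser; override only the prompt.
args = create_arg_parser().parse_args(["--prompt", "A watercolor fox in a forest"])

pipeline = load_pipeline(args)

# Schnell is timestep-distilled, so guidance is dropped and max_sequence_length
# is 256, mirroring the script's __main__ block.
image = pipeline(
    prompt=args.prompt,
    num_inference_steps=args.num_inference_steps,
    max_sequence_length=256,
    guidance_scale=None,
    generator=torch.manual_seed(args.seed),
).images[0]
image.save("fox.png")
```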

utils/pipeline_utils.py

Lines changed: 12 additions & 12 deletions
@@ -3,7 +3,7 @@
 import torch
 import torch.nn.functional as F
 from diffusers import FluxPipeline
-from torch._inductor.package import load_package
+from torch._inductor.package import load_package as inductor_load_package
 from typing import List, Optional, Tuple


@@ -233,6 +233,14 @@ def download_hosted_file(filename, output_path):
     hf_hub_download(REPO_NAME, filename, local_dir=os.path.dirname(output_path))


+def load_package(package_path):
+    if not os.path.exists(package_path):
+        download_hosted_file(os.path.basename(package_path), package_path)
+
+    loaded_package = inductor_load_package(package_path, run_single_threaded=True)
+    return loaded_package
+
+
 def use_export_aoti(pipeline, cache_dir, serialize=False, is_timestep_distilled=True):
     # create cache dir if needed
     pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)
@@ -270,12 +278,7 @@ def _example_tensor(*shape):
             inductor_configs={"max_autotune": True, "triton.cudagraphs": True},
         )
     # download serialized model if needed
-    if not os.path.exists(transformer_package_path):
-        download_hosted_file(os.path.basename(transformer_package_path), transformer_package_path)
-
-    loaded_transformer = load_package(
-        transformer_package_path, run_single_threaded=True
-    )
+    loaded_transformer = load_package(transformer_package_path)

     # warmup before cudagraphing
     with torch.no_grad():
@@ -310,10 +313,7 @@ def _example_tensor(*shape):
             inductor_configs={"max_autotune": True, "triton.cudagraphs": True},
         )
     # download serialized model if needed
-    if not os.path.exists(decoder_package_path):
-        download_hosted_file(os.path.basename(decoder_package_path), decoder_package_path)
-
-    loaded_decoder = load_package(decoder_package_path, run_single_threaded=True)
+    loaded_decoder = load_package(decoder_package_path)

     # warmup before cudagraphing
     with torch.no_grad():
@@ -334,7 +334,7 @@ def _example_tensor(*shape):


 def optimize(pipeline, args):
-    is_timestep_distilled = args.ckpt == "black-forest-labs/FLUX.1-schnell"
+    is_timestep_distilled = not pipeline.transformer.config.guidance_embeds

     # fuse QKV projections in Transformer and VAE
     if not args.disable_fused_projections:
