
Commit 91c5879

Merge of parents 6106fc1 and 586f8f7.

Commit message: merge conflicts.

File tree: 5 files changed (+57, -5 lines)

README.md

Lines changed: 13 additions & 0 deletions
@@ -18,6 +18,19 @@ Summary of the optimizations:
 * `coordinate_descent_check_all_directions = True`
 * `torch.export` + Ahead-of-time Inductor (AOTI) + CUDAGraphs

+All of the above optimizations are lossless (outside of minor numerical differences sometimes
+introduced through the use of `torch.compile` / `torch.export`) EXCEPT FOR dynamic float8 quantization.
+Disable quantization if you want the same quality results as the baseline while still being
+quite a bit faster.
+
+Here are some example outputs for the prompt `"A cat playing with a ball of yarn"`:
+
+**Baseline:**
+![baseline_output](https://github.com/user-attachments/assets/8ba746d2-fbf3-4e30-adc4-11303231c146)
+
+**Fully-optimized (with quantization):**
+![fast_output](https://github.com/user-attachments/assets/1a31dec4-38d5-45b2-8ae6-c7fb2e6413a4)
+
 ## Setup
 We rely primarily on pure PyTorch for the optimizations. Currently, a relatively recent nightly version of PyTorch is required.
 The numbers reported here were gathered using:

gen_image.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import random
+import time
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+from utils.benchmark_utils import annotate, create_parser
+from utils.pipeline_utils import load_pipeline  # noqa: E402
+
+
+def set_rand_seeds(seed):
+    random.seed(seed)
+    torch.manual_seed(seed)
+
+
+def main(args):
+    pipeline = load_pipeline(args)
+    set_rand_seeds(args.seed)
+
+    image = pipeline(
+        args.prompt, num_inference_steps=args.num_inference_steps, guidance_scale=0.0
+    ).images[0]
+    image.save(args.output_file)
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    args = parser.parse_args()
+    # use the cached model to minimize latency
+    args.use_cached_model = True
+    main(args)
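
The new gen_image.py is a thin entry point around the shared pipeline utilities: it seeds the RNGs, runs a single generation, and saves the image. An invocation might look like the following sketch, using only the flags defined in utils/benchmark_utils.py (all of them have defaults, so each flag is optional):

    python gen_image.py --prompt "A cat playing with a ball of yarn" --seed 42 --num_inference_steps 4 --output-file cat.png

Because the script hard-codes args.use_cached_model = True, it attempts to reuse a previously exported model from --cache-dir rather than re-exporting one (per the flag's help text).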

run_benchmark.py

Lines changed: 7 additions & 4 deletions
@@ -12,8 +12,15 @@ def _determine_pipe_call_kwargs(args):
     kwargs = {"max_sequence_length": 512, "guidance_scale": 3.5}
     return kwargs

+def set_rand_seeds(seed):
+    random.seed(seed)
+    torch.manual_seed(seed)
+
+
 def main(args):
+    set_rand_seeds(args.seed)
     pipeline = load_pipeline(args)
+    set_rand_seeds(args.seed)

     # warmup
     for _ in range(3):
@@ -66,10 +73,6 @@ def main(args):


 if __name__ == "__main__":
-    seed = 42
-    random.seed(seed)
-    torch.manual_seed(seed)
-
     parser = create_parser()
     args = parser.parse_args()
     main(args)
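
Seeding moves from module import time into main(), and it is applied both before and after load_pipeline, presumably because loading and compiling the pipeline can itself consume RNG state and shift the generation stream. A minimal standalone illustration of why re-seeding after a noisy phase matters (hypothetical, not part of the commit):

    import random
    import torch

    def set_rand_seeds(seed):
        random.seed(seed)
        torch.manual_seed(seed)

    set_rand_seeds(42)
    _ = torch.randn(8)        # stand-in for RNG draws during pipeline loading
    set_rand_seeds(42)        # re-seed so generation starts from a known state
    a = torch.randn(4)

    set_rand_seeds(42)
    b = torch.randn(4)
    assert torch.equal(a, b)  # identical draws after re-seeding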

utils/benchmark_utils.py

Lines changed: 4 additions & 0 deletions
@@ -15,12 +15,16 @@ def create_parser():
                         help="Text prompt")
     parser.add_argument("--cache-dir", type=str, default=os.path.expandvars("$HOME/.cache/flux-fast"),
                         help="Cache directory for storing exported models")
+    parser.add_argument("--use-cached-model", action="store_true",
+                        help="Attempt to use cached model only (don't re-export)")
     parser.add_argument("--device", type=str, choices=["cuda", "cpu"], default="cuda",
                         help="Device to use")
     parser.add_argument("--num_inference_steps", type=int, default=4,
                         help="Number of denoising steps")
     parser.add_argument("--output-file", type=str, default="output.png",
                         help="Output image file path")
+    parser.add_argument("--seed", type=int, default=42,
+                        help="Random seed to use")
     # file path for optional output PyTorch Profiler trace
     parser.add_argument("--trace-file", type=str, default=None,
                         help="Output PyTorch Profiler trace file path")

utils/pipeline_utils.py

Lines changed: 4 additions & 1 deletion
@@ -387,7 +387,10 @@ def optimize(pipeline, args):
     elif args.compile_export_mode == "export_aoti":
         # NB: Using a cached export + AOTI model is not supported yet
         pipeline = use_export_aoti(
-            pipeline, cache_dir=args.cache_dir, serialize=True, is_timestep_distilled=is_timestep_distilled
+            pipeline,
+            cache_dir=args.cache_dir,
+            serialize=(not args.use_cached_model),
+            is_timestep_distilled=is_timestep_distilled
         )
     elif args.compile_export_mode == "disabled":
         pass
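
Here serialize=(not args.use_cached_model) means the AOTI export is serialized to the cache only when --use-cached-model is absent; with the flag set, the previously exported model is reused instead. A hypothetical sketch of that cache-or-export pattern (export_or_load and the artifact name are illustrative, not the repo's actual API):

    import os

    def export_or_load(cache_dir: str, name: str, serialize: bool) -> str:
        """Return a path to a cached artifact, re-creating it only when serialize=True."""
        path = os.path.join(cache_dir, name)
        if serialize or not os.path.exists(path):
            os.makedirs(cache_dir, exist_ok=True)
            with open(path, "wb") as f:
                f.write(b"exported-model-bytes")  # stand-in for the real AOTI export
        return path

    # illustrative call mirroring the default cache dir and a made-up artifact name
    artifact = export_or_load(os.path.expandvars("$HOME/.cache/flux-fast"), "transformer.pt2", serialize=False)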
