Skip to content

Commit cc0a38a

Browse files
Commit message: "fixes"
committed — 1 parent: ab7f381 · this commit: cc0a38a

File tree

2 files changed

+7
-11
lines changed

2 files changed

+7
-11
lines changed

benchmarks/benchmarking_flux.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
class BenchmarkFlux(BenchmarkMixin):
99
model_class = FluxTransformer2DModel
10-
compile_kwargs = {"fullgraph": True, "mode": "max-autotune"}
10+
compile_kwargs = {"fullgraph": True}
1111

1212
def get_model_init_dict(self):
1313
return {
@@ -29,8 +29,8 @@ def get_input_dict(self):
2929
pooled_prompt_embeds = torch.randn(1, 768, device=torch_device, dtype=torch.bfloat16)
3030
image_ids = torch.ones(512, 3, device=torch_device, dtype=torch.bfloat16)
3131
text_ids = torch.ones(4096, 3, device=torch_device, dtype=torch.bfloat16)
32-
timestep = torch.tensor([1.0], device=torch_device)
33-
guidance = torch.tensor([1.0], device=torch_device)
32+
timestep = torch.tensor([1.0], device=torch_device, dtype=torch.bfloat16)
33+
guidance = torch.tensor([1.0], device=torch_device, dtype=torch.bfloat16)
3434

3535
return {
3636
"hidden_states": hidden_states,

benchmarks/benchmarking_utils.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def benchmark_fn(f, *args, **kwargs):
1111
t0 = benchmark.Timer(
1212
stmt="f(*args, **kwargs)",
1313
globals={"args": args, "kwargs": kwargs, "f": f},
14-
num_threads=torch.get_num_threads(),
14+
num_threads=1,
1515
)
1616
return f"{(t0.blocked_autorange().mean):.3f}"
1717

@@ -53,10 +53,6 @@ def run_benchmark(self):
5353
model = self.initialize_model() # Takes care of device placement.
5454
input_dict = self.get_input_dict() # Takes care of device placement.
5555

56-
# warmup
57-
for _ in range(5):
58-
_ = model(**input_dict)
59-
6056
time = benchmark_fn(lambda model, input_dict: model(**input_dict), model, input_dict)
6157
memory = torch.cuda.max_memory_allocated() / (1024**3)
6258
memory = float(f"{memory:.2f}")
@@ -69,9 +65,9 @@ def run_benchmark(self):
6965
compile_stats = None
7066
if self.compile_kwargs is not None:
7167
model = self.initialize_model()
72-
with torch._inductor.utils.fresh_inductor_cache():
73-
model.compile(**self.compile_kwargs)
74-
time = benchmark_fn(lambda model, input_dict: model(**input_dict), model, input_dict)
68+
input_dict = self.get_input_dict()
69+
model.compile(**self.compile_kwargs)
70+
time = benchmark_fn(lambda model, input_dict: model(**input_dict), model, input_dict)
7571
memory = torch.cuda.max_memory_allocated() / (1024**3)
7672
memory = float(f"{memory:.2f}")
7773
compile_stats = {"time": time, "memory": memory}

0 commit comments

Comments (0)