checking.

sayakpaul · sayakpaul · commit 169f831cf33f · 2025-05-15T21:20:13.000+05:30
diff --git a/benchmarks/benchmarking_utils.py b/benchmarks/benchmarking_utils.py
@@ -1,4 +1,6 @@
 import gc
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Optional
 
 import torch
 import torch.utils.benchmark as benchmark
@@ -13,7 +15,7 @@ def benchmark_fn(f, *args, **kwargs):
         globals={"args": args, "kwargs": kwargs, "f": f},
         num_threads=1,
     )
-    return f"{(t0.blocked_autorange().mean):.3f}"
+    return float(f"{(t0.blocked_autorange().mean):.3f}")
 
 
 def flush():
@@ -23,11 +25,18 @@ def flush():
     torch.cuda.reset_peak_memory_stats()
 
 
+@dataclass
+class BenchmarkScenario:  # one benchmark configuration, consumed by BenchmarkMixin.run_benchmark
+    name: str  # label reported under the "scenario" key of the result dict
+    model_cls: ModelMixin  # NOTE(review): not read in the visible code; Type[ModelMixin] intended? confirm
+    model_init_kwargs: Dict[str, Any]  # unpacked as keyword arguments into model_init_fn
+    model_init_fn: Callable  # called once per benchmark phase to build the model
+    get_model_input_dict: Callable[[], Dict[str, Any]]  # zero-arg factory; its dict is unpacked into the model call
+    compile_kwargs: Optional[Dict[str, Any]] = None  # forwarded to model.compile(); None (default) = no compiled phase
+
+
 @require_torch_gpu
 class BenchmarkMixin:
-    model_class: ModelMixin = None
-    compile_kwargs: dict = None
-
     def get_model_init_dict(self):
         raise NotImplementedError
 
@@ -47,31 +56,61 @@ def post_benchmark(self, model):
         torch.compiler.reset()
 
     @torch.no_grad()
-    def run_benchmark(self):
+    def run_benchmark(self, scenario: BenchmarkScenario) -> Dict[str, Any]:
+        """Benchmark `scenario` eagerly and, if compile_kwargs is not None, compiled; return the merged stats."""
+        plain = self._run_phase(
+            init_fn=scenario.model_init_fn,
+            init_kwargs=scenario.model_init_kwargs,
+            get_input_fn=scenario.get_model_input_dict,
+            compile_kwargs=None,  # eager baseline: this phase never compiles
+        )
+
+        # Compiled phase: requested by any non-None compile_kwargs, so {} means model.compile() with no arguments.
+        compiled = None
+        if scenario.compile_kwargs is not None:
+            compiled = self._run_phase(
+                init_fn=scenario.model_init_fn,
+                init_kwargs=scenario.model_init_kwargs,
+                get_input_fn=scenario.get_model_input_dict,
+                compile_kwargs=scenario.compile_kwargs,
+            )
+
+        # Merge: the "*_compile_*" keys are only added when the compiled phase actually ran.
+        result = {"scenario": scenario.name, "time_plain_s": plain["time"], "mem_plain_GB": plain["memory"]}
+        if compiled is not None:
+            result.update(
+                {
+                    "time_compile_s": compiled["time"],
+                    "mem_compile_GB": compiled["memory"],
+                }
+            )
+        return result
+
+    def _run_phase(
+        self,
+        *,
+        init_fn: Callable[..., Any],
+        init_kwargs: Dict[str, Any],
+        get_input_fn: Callable[[], Dict[str, torch.Tensor]],
+        compile_kwargs: Optional[Dict[str, Any]],
+    ) -> Dict[str, float]:
+        """Run one phase: build the model, optionally compile it, time a model call, and report peak CUDA memory."""
         self.pre_benchmark()
 
-        model = self.initialize_model()  # Takes care of device placement.
-        input_dict = self.get_input_dict()  # Takes care of device placement.
-
-        time = benchmark_fn(lambda model, input_dict: model(**input_dict), model, input_dict)
-        memory = torch.cuda.max_memory_allocated() / (1024**3)
-        memory = float(f"{memory:.2f}")
-        non_compile_stats = {"time": time, "memory": memory}
+        # init & (optional) compile: None skips compilation; any dict (even {}) is forwarded to model.compile()
+        model = init_fn(**init_kwargs)
+        if compile_kwargs is not None:
+            model.compile(**compile_kwargs)
 
-        self.post_benchmark(model)
-        del model
-        self.pre_benchmark()
+        # build inputs once the model exists; the dict is unpacked into the model call below
+        inp = get_input_fn()
 
-        compile_stats = None
-        if self.compile_kwargs is not None:
-            model = self.initialize_model()
-            input_dict = self.get_input_dict()
-            model.compile(**self.compile_kwargs)
-            time = benchmark_fn(lambda model, input_dict: model(**input_dict), model, input_dict)
-            memory = torch.cuda.max_memory_allocated() / (1024**3)
-            memory = float(f"{memory:.2f}")
-            compile_stats = {"time": time, "memory": memory}
+        # measure: benchmark_fn times model(**inp); memory is the peak allocation divided by 1024**3
+        time_s = benchmark_fn(lambda m, d: m(**d), model, inp)
+        mem_gb = torch.cuda.max_memory_allocated() / (1024**3)
+        mem_gb = round(mem_gb, 2)
 
+        # teardown: run the post-benchmark hook, then drop this phase's reference to the model
         self.post_benchmark(model)
         del model
-        return non_compile_stats, compile_stats
+        return {"time": time_s, "memory": mem_gb}