Commit 905b152

Run CUDA synchronize after each _do_bench call, to surface error sooner (#544)
1 parent 3cce791 commit 905b152

File tree

1 file changed: +4 −7 lines

tritonbench/utils/triton_op.py

Lines changed: 4 additions & 7 deletions
@@ -1031,13 +1031,8 @@ def run(
             # before we hand them to the captured graph. Otherwise we can
             # read partially initialized values (e.g. from torch.randint)
             # and hit device-side asserts in the baseline kernels.
-            if (
-                self.use_cuda_graphs
-                and self.device
-                and self.device.startswith("cuda")
-                and torch.cuda.is_available()
-            ):
-                torch.cuda.synchronize()
+            if self.use_cuda_graphs:
+                torch.accelerator.synchronize()
             self.baseline_fn = None
             self.baseline_metrics = None
             self._op_flops = {}
@@ -1108,6 +1103,8 @@ def _reduce_benchmarks(acc, bm_name: str):
                 quantiles=quantiles,
                 baseline=baseline,
             )
+            # Synchronize after each benchmark to make errors surface sooner
+            torch.accelerator.synchronize()
             if baseline:
                 self.baseline_metrics = acc[bm_name]
             if sleep:
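
For context, a minimal sketch (not from the repository) of why synchronizing after each benchmark call surfaces errors sooner: GPU kernels are launched asynchronously, so a device-side assert or illegal memory access is typically only reported at the next synchronization point. Without a per-benchmark sync, the error would show up inside a later, unrelated benchmark. The run_benchmarks helper and the benchmarks list below are hypothetical; only torch.accelerator.synchronize() comes from the commit.

    import torch

    def run_benchmarks(benchmarks):
        """Hypothetical driver illustrating the per-benchmark synchronize pattern."""
        for name, fn in benchmarks:
            fn()  # kernels are enqueued asynchronously; a failure may not raise here
            # Force pending device work to finish so any device-side error is
            # raised now and attributed to `name`, not to a later benchmark.
            torch.accelerator.synchronize()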
