Remove workaround in testing.py (#4846)

whitneywhtsang · web-flow · commit 0ca9cda35f99 · 2025-08-06T14:11:32.000-04:00
Since we are now using DLE 2025.1.1, we can remove the workaround.

---------

Signed-off-by: Whitney Tsang &lt;whitney.tsang@intel.com&gt;
diff --git a/benchmarks/triton_kernels_benchmark/benchmark_testing.py b/benchmarks/triton_kernels_benchmark/benchmark_testing.py
@@ -19,7 +19,6 @@
 import torch
 from torch.profiler import profile, ProfilerActivity, record_function
 
-import triton
 from triton.testing import assert_close as triton_assert_close, Benchmark, do_bench as triton_do_bench
 
 from triton_kernels_benchmark import build_report
@@ -93,9 +92,6 @@ def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quan
         fn()
     end_event.record()
     synchronize()
-    # FIXME: to avoid negative timings before DLE 2025.1;
-    # this workaround doesn't work for BMG.
-    triton.runtime.driver.active.utils.wait()
     estimate_ms = start_event.elapsed_time(end_event) / 5
 
     # The cache is also maintained in `triton_do_bench` function,
diff --git a/python/triton/testing.py b/python/triton/testing.py
@@ -8,65 +8,6 @@
 from typing import Any, Dict, List
 from . import language as tl
 from . import runtime
-import time
-import logging
-
-
-@functools.cache
-def _support_elapsed_time():
-    import torch
-    import triton
-
-    support = True
-    message_unsupported = "Wall time is used instead of elapsed_time (not supported). \
-        The timing measurements could be innacurate."
-
-    message_bug = "Wall time is used instead of elapsed_time because of the bug ('negative timings'). \
-        Should be fixed in DLE 2025.1. The timing measurements could be innacurate."
-
-    # FIXME: 5 iterations to detect negative timings bug; 1 should be enough for DLE 2025.1
-    for _ in range(5):
-        e1 = torch.xpu.Event(enable_timing=True)
-        e1.record()
-
-        e2 = torch.xpu.Event(enable_timing=True)
-        e2.record()
-
-        try:
-            # FIXME: to avoid negative timings before DLE 2025.1;
-            # this workaround doesn't work for BMG.
-            triton.runtime.driver.active.utils.wait()
-            if e1.elapsed_time(e2) <= 0:
-                logging.warning(message_bug)
-                support = False
-                break
-        except Exception:
-            logging.warning(message_unsupported)
-            support = False
-            break
-
-    return support
-
-
-class WallEvent():
-
-    def __init__(self, **kwargs):
-        self.record()
-
-    def record(self):
-        self.timestamp = time.perf_counter_ns() / 1_000_000
-
-    def elapsed_time(self, end):
-        return end.timestamp - self.timestamp
-
-
-def Event(**kwargs):
-    if _support_elapsed_time():
-        import torch
-
-        return torch.xpu.Event(**kwargs)
-    else:
-        return WallEvent(**kwargs)
 
 
 def nvsmi(attrs):
@@ -202,7 +143,6 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_m
     :type return_mode: str
     """
     assert return_mode in ["min", "max", "mean", "median", "all"]
-    import triton
 
     di = runtime.driver.active.get_device_interface()
 
@@ -212,30 +152,21 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_m
     cache = runtime.driver.active.get_empty_cache_for_benchmark()
 
     # Estimate the runtime of the function
-    start_event = Event(enable_timing=True)
-    end_event = Event(enable_timing=True)
-    USE_WALL_TIME = isinstance(start_event, WallEvent)
-
+    start_event = di.Event(enable_timing=True)
+    end_event = di.Event(enable_timing=True)
     start_event.record()
     for _ in range(5):
         runtime.driver.active.clear_cache(cache)
         fn()
-        if USE_WALL_TIME:
-            di.synchronize()
     end_event.record()
-    if not USE_WALL_TIME:
-        di.synchronize()
-
-    # FIXME: to avoid negative timings before DLE 2025.1;
-    # this workaround doesn't work for BMG.
-    triton.runtime.driver.active.utils.wait()
+    di.synchronize()
     estimate_ms = start_event.elapsed_time(end_event) / 5
 
     # compute number of warmup and repeat
     n_warmup = max(1, int(warmup / estimate_ms))
     n_repeat = max(1, int(rep / estimate_ms))
-    start_event = [Event(enable_timing=True) for i in range(n_repeat)]
-    end_event = [Event(enable_timing=True) for i in range(n_repeat)]
+    start_event = [di.Event(enable_timing=True) for i in range(n_repeat)]
+    end_event = [di.Event(enable_timing=True) for i in range(n_repeat)]
     # Warm-up
     for _ in range(n_warmup):
         fn()
@@ -249,21 +180,12 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_m
                 x.grad = None
         # we clear the L2 cache before each run
         runtime.driver.active.clear_cache(cache)
-        if USE_WALL_TIME:
-            di.synchronize()
         # record time of `fn`
         start_event[i].record()
         fn()
-        if USE_WALL_TIME:
-            di.synchronize()
         end_event[i].record()
     # Record clocks
-    if not USE_WALL_TIME:
-        di.synchronize()
-
-    # FIXME: to avoid negative timings before DLE 2025.1;
-    # this workaround doesn't work for BMG.
-    triton.runtime.driver.active.utils.wait()
+    di.synchronize()
     times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)]
     return _summarize_statistics(times, quantiles, return_mode)