Skip to content

Commit e1773dd

Browse files
committed
Enable launcher benchmarks in CI for XPU
1 parent 6f41a1d commit e1773dd

File tree

2 files changed: 12 additions and 7 deletions

.github/workflows/build-test-reusable.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,11 @@ jobs:
373373
run: |
374374
timeout -s KILL 4900 ${{ env.TRITON_TEST_CMD }} --inductor || ${{ inputs.ignore_errors }}
375375
376+
- name: Run microbenchmark tests
377+
if: matrix.suite == 'rest'
378+
run: |
379+
python3 python/test/microbenchmark/launch_overhead.py
380+
376381
- name: Save pip cache
377382
if: ${{ steps.pip-cache.outputs.status == 'miss' }}
378383
uses: ./.github/actions/save

python/test/microbenchmark/launch_overhead.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ def nop_args(
4242
def do_bench_walltime(fn):
4343
print("Compiling...")
4444
fn()
45-
torch.cuda.synchronize()
45+
torch.xpu.synchronize()
4646

4747
for _ in range(1000):
4848
fn()
49-
torch.cuda.synchronize()
49+
torch.xpu.synchronize()
5050

5151
n_repeat = 10000
5252

@@ -55,11 +55,11 @@ def do_bench_walltime(fn):
5555
for _ in range(25):
5656
print("Running %d benchmarking iterations..." % n_repeat)
5757
# Benchmark
58-
torch.cuda.synchronize()
58+
torch.xpu.synchronize()
5959
start_time = time.time()
6060
for _ in range(n_repeat):
6161
fn()
62-
torch.cuda.synchronize()
62+
torch.xpu.synchronize()
6363
end_time = time.time()
6464
wall_time_ms = (end_time - start_time) * 1e3 / n_repeat
6565
mses.append(wall_time_ms)
@@ -71,7 +71,7 @@ def do_bench_walltime(fn):
7171
profile.enable()
7272
for _ in range(n_repeat):
7373
fn()
74-
torch.cuda.synchronize()
74+
torch.xpu.synchronize()
7575
profile.disable()
7676
stats = pstats.Stats(profile)
7777
stats.sort_stats("time")
@@ -81,9 +81,9 @@ def do_bench_walltime(fn):
8181

8282
def main(use_tensor_desc: bool):
8383
if use_tensor_desc:
84-
targs = [TensorDescriptor.from_tensor(torch.zeros(1, 16, device="cuda"), block_shape=[1, 16]) for _ in range(5)]
84+
targs = [TensorDescriptor.from_tensor(torch.zeros(1, 16, device="xpu"), block_shape=[1, 16]) for _ in range(5)]
8585
else:
86-
targs = [torch.zeros(1, device="cuda") for _ in range(5)]
86+
targs = [torch.zeros(1, device="xpu") for _ in range(5)]
8787
ncargs = [0, 1, 1024, 2**31 - 1, 2**64 - 1, False, True, None, (16, 16)]
8888
cargs = [32, False, True, 0, 64]
8989

0 commit comments

Comments (0)