Make acc matrix allocation on each call for XeTLA GEMM benchmarks (#3026)

anmyachev · web-flow · commit 7eb41bf239d5 · 2024-12-18T16:44:29.000+01:00
Comparing https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/12382880184/job/34564504020 (main) with https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/12390456716/job/34585505155 (PR), this pull request degrades the measured XeTLA performance by ~3%. However, this also points to a **potential opportunity** to improve the Triton kernel by allocating the accumulation matrix only once. If that is implemented for Triton, this pull request will need to be reverted for XeTLA. --------- Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
diff --git a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
@@ -306,19 +306,27 @@ def benchmark(B, M, N, K, provider):
     elif provider == 'xetla':
         if B == 1:
             c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
-            acc = torch.zeros((M, N), device='xpu', dtype=torch.float32)
             cnt = torch.zeros((M, N), device='xpu', dtype=torch.int32)
         else:
             c = torch.zeros((B, M, N), device='xpu', dtype=torch.float32)
-            acc = torch.zeros((B, M, N), device='xpu', dtype=torch.float32)
             cnt = torch.zeros((B, M, N), device='xpu', dtype=torch.int32)
         name = f'gemm_shape_{B}_{M}_{K}_{N}'
         # FIXME: Use gemm_streamk_benchmark.py when Triton streamk can get
         # better performance.
         if (B, M, N, K) == (1, 3072, 3072, 4096):
             name = 'gemm_streamk_shape_3072_4096_3072'
         func = getattr(xetla_kernel, name)
-        xetla_fn = lambda: func(a, b, c, acc, cnt)
+
+        def xetla_func_with_acc_allocation():
+            # allocating `acc` matrix on every function call, to be as similar as
+            # possible to the triton kernel, which also does this on every call.
+            if B == 1:
+                acc = torch.zeros((M, N), device='xpu', dtype=torch.float32)
+            else:
+                acc = torch.zeros((B, M, N), device='xpu', dtype=torch.float32)
+            return func(a, b, c, acc, cnt)
+
+        xetla_fn = xetla_func_with_acc_allocation
         torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
 
         kernels_name = {