Apply relaxed memory semantics (sem='relaxed') to the stream-K and split-K atomic add (#3706)

leonling-ll · web-flow · commit d61d915745ab · 2025-03-18T07:15:39.000-04:00
diff --git a/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py
@@ -61,7 +61,7 @@ def _kernel(A, B, C,  #
         rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
         C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn)
         mask = (rm < M)[:, None] & (rn < N)[None, :]
-        tl.atomic_add(C, acc, mask=mask)
+        tl.atomic_add(C, acc, mask=mask, sem='relaxed')
 
 
 class _matmul(torch.autograd.Function):
diff --git a/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py
@@ -94,7 +94,7 @@ def mac_loop(
         rn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
         c_ptr_ = c_ptr + rm[:, None] * stride_cm + rn[None, :] * stride_cn
         mask = (rm < M)[:, None] & (rn < N)[None, :]
-        tl.atomic_add(c_ptr_, acc, mask=mask)
+        tl.atomic_add(c_ptr_, acc, mask=mask, sem='relaxed')
 
 
 @triton.autotune(