Commit df39911

[AMD][Backend] Optimize Global AtomicRMW Ops (#7496)
This follows the memory model semantics described in triton-lang/triton#7292.

The following code

```
@triton.jit
def atomic_kernel(x_ptr,
                  BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    t1 = tl.full((BLOCK_SIZE, ), 1, dtype=tl.bfloat16)
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    tl.atomic_add(x_ptr + offsets, t1)
```

emits a single tt.atomic_rmw op that is lowered to multiple llvm.atomicrmw ops. For global (i.e. non-buffer) atomics, with the top-level tt.atomic_rmw op set to acquire-release semantics (the default when no sem value is explicitly set on the DSL-level tl.atomic_add),

```
tt.atomic_rmw fadd, acq_rel
```

is lowered to (up to 8 ops for num_warps=1)

```
llvm.atomicrmw acq_rel
llvm.atomicrmw acq_rel
```

with every LLVM-level atomicrmw inheriting the acq_rel ordering. This causes the LLVM amdgcn backend to insert unnecessary buffer invalidates and L2 cache write-backs. The buffer-atomic case already handles this well:

```
buffer_wbl2 sc1
buffer_atomic_pk_add_bf16 v1, v0, s[0:3], 0 offen
buffer_atomic_pk_add_bf16 v1, v0, s[0:3], 0 offen offset:4
buffer_inv sc1
```

For global atomics, however, the following GCN is emitted:

```
buffer_wbl2 sc1
global_atomic_pk_add_bf16 v0, v1, s[0:1]
s_waitcnt vmcnt(0)
buffer_inv sc1
buffer_wbl2 sc1
global_atomic_pk_add_bf16 v0, v1, s[0:1] offset:4
s_waitcnt vmcnt(0)
buffer_inv sc1
```

causing a 2x-8x slowdown (depending on num_warps) compared to the buffer-atomic case.

After this PR the emitted GCN no longer contains the unnecessary buffer invalidates and L2 cache write-backs:

```
buffer_wbl2 sc1
global_atomic_pk_add_bf16 v0, v1, s[0:1]
global_atomic_pk_add_bf16 v0, v1, s[0:1] offset:4
s_waitcnt vmcnt(0)
buffer_inv sc1
```

and global-atomic and buffer-atomic performance now match.

Co-authored w/ @karthik-man.
1 parent a89c5bd commit df39911
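
The fix (see the LoadStoreOpToLLVM.cpp hunk below) keeps acquire-release semantics only at the boundaries of each thread's cluster of atomics and relaxes the interior ones. Here is a minimal sketch of that selection rule, written in Python purely for illustration; the helper name `pick_ordering` is hypothetical, while `i`, `vec`, and `elems_per_thread` mirror the variables in the C++ diff:

```python
def pick_ordering(i: int, vec: int, elems_per_thread: int) -> str:
    """Illustrative sketch of the ordering chosen per llvm.atomicrmw when a
    single acq_rel tt.atomic_rmw lowers to several atomics; `i` steps
    through a thread's elements in increments of `vec`."""
    if i == 0:
        return "release"    # first op: publish prior writes once
    if i == elems_per_thread - vec:
        return "acquire"    # last op: observe other agents' writes once
    return "monotonic"      # middle ops: relaxed, no extra fencing


# With 8 f32 elements per thread and a vector width of 1 (as in the fp32 lit
# test below) this yields release, six monotonics, then acquire.
print([pick_ordering(i, 1, 8) for i in range(8)])
```

Atomics outside an acq_rel cluster, or a thread with only a single atomic, keep their original ordering, which corresponds to the `vec > 1 || elemsPerThread > 1` guard in the patch.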

File tree

3 files changed: +98 -0 lines changed


test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 71 additions & 0 deletions
@@ -475,3 +475,74 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @atomic_kernel_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) release
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) acquire
+    %cst = arith.constant dense<true> : tensor<1024xi1, #blocked>
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<1024xbf16, #blocked>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.addptr %arg0, %1 : !tt.ptr<bf16>, i32
+    %4 = tt.splat %3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked>
+    %5 = tt.addptr %4, %2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked>
+    %6 = tt.atomic_rmw fadd, acq_rel, gpu, %5, %cst_0, %cst : (tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xbf16, #blocked>, tensor<1024xi1, #blocked>) -> tensor<1024xbf16, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @atomic_kernel_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) release
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) acquire
+    %cst = arith.constant dense<true> : tensor<1024xi1, #blocked>
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<1024xbf16, #blocked>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.addptr %arg0, %1 : !tt.ptr<bf16>, i32
+    %4 = tt.splat %3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked>
+    %5 = tt.addptr %4, %2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked>
+    %6 = tt.atomic_rmw fadd, acq_rel, gpu, %5, %cst_0, %cst : (tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xbf16, #blocked>, tensor<1024xi1, #blocked>) -> tensor<1024xbf16, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @atomic_kernel_fp32(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) release
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
+    // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) acquire
+    %cst = arith.constant dense<true> : tensor<1024xi1, #blocked>
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
+    %5 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+    %7 = tt.atomic_rmw fadd, acq_rel, gpu, %6, %cst_0, %cst : (tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi1, #blocked>) -> tensor<1024xf32, #blocked>
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/AtomicRMWOpsEmitter.h

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,9 @@ class AtomicRMWEmitter {
 
   Value emitPairedAtomicForEvenTID(RewriterBase &rewriter, Value rmwPtr,
                                    Value valElem, Value rmwMask) const;
+  void setAtomicOrdering(LLVM::AtomicOrdering memOrder) {
+    this->memOrder = memOrder;
+  }
 
 private:
   const mlir::triton::AMD::TargetInfo &targetInfo;

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 24 additions & 0 deletions
@@ -1529,6 +1529,29 @@ struct AtomicRMWOpConversion
         valElement = vecVal;
       }
 
+      // If we have a single tl.atomic_rmw that is lowered into multiple
+      // llvm.atomic_rmw ops and set the ordering for each to acq_rel (the
+      // default if no sem value is explicitly set in the DSL-level
+      // tl.atomic_add), the llvm backend will insert extra buffer
+      // invalidates and L2 write backs, causing a performance degradation.
+      // To avoid this we set the ordering to release for the first, acquire
+      // for the last, and relaxed for anything in between, so that only a
+      // single set of buffer_inv and buffer_wbl2 instructions is inserted
+      // by the backend for any "cluster" of atomic ops.
+      if ((vec > 1 || elemsPerThread > 1) &&
+          op.getSem() == MemSemantic::ACQUIRE_RELEASE) {
+        if (i == 0) {
+          // First
+          emitter.setAtomicOrdering(LLVM::AtomicOrdering::release);
+        } else if (i == elemsPerThread - vec) {
+          // Last
+          emitter.setAtomicOrdering(LLVM::AtomicOrdering::acquire);
+        } else {
+          // Middle
+          emitter.setAtomicOrdering(LLVM::AtomicOrdering::monotonic);
+        }
+      }
+
       Value retVal =
           emitter.emitAtomicRMW(rewriter, ptrElements[i], valElement, rmwMask,
                                 atomicSharedMemBase, enableIntraWaveReduce);
@@ -1548,6 +1571,7 @@ struct AtomicRMWOpConversion
       Value atomPtr = *atomicSharedMemBase;
       b.barrier();
       Value ret = b.load(valueElemTy, atomPtr);
+
       rewriter.replaceOp(op, {ret});
     }
   }
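
For completeness, a hedged end-to-end sketch of exercising this path from the DSL, assuming a ROCm build of PyTorch and Triton; the tensor size and launch configuration are illustrative and the kernel mirrors the one in the commit message:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def atomic_kernel(x_ptr, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    t1 = tl.full((BLOCK_SIZE, ), 1, dtype=tl.bfloat16)
    # No explicit `sem` argument, so the tt.atomic_rmw defaults to acq_rel;
    # with this change the AMD backend fences each thread's cluster of global
    # atomics once instead of around every llvm.atomicrmw.
    tl.atomic_add(x_ptr + offsets, t1)


x = torch.zeros(1024, dtype=torch.bfloat16, device="cuda")  # "cuda" maps to ROCm in PyTorch
atomic_kernel[(x.numel() // 1024,)](x, BLOCK_SIZE=1024)
```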
