
Commit 2edb2e7
[AMD] Add support for Buffer Atomic CAS (#7292)
This PR adds support for Buffer Atomic CAS conversion. It is mostly based on PR #5549, with the following differences:

1. Changes to handle the differences in arguments between tl.atomic_cas and tl.atomic_<rmw op>.
2. BUFFER_ATOMIC_CMPSWAP supports fewer dtypes than BUFFER_ATOMIC_XX.
3. "s_waitcnt vmcnt(0)" instructions are not emitted between buffer_atomic instructions lowered from the same tl.atomic_cas. The s_waitcnt is not necessary for relaxed ordering at any scope. For the agent-scope rel/acq/acq_rel cases, the s_waitcnt vmcnt(0) appears to be required only before/after the whole sequence of buffer_atomic instructions lowered from the same tl.atomic_ op. The preceding/succeeding FenceOp will emit the necessary s_waitcnt vmcnt(0) and L2 invalidate/writeback instructions. See comments for more details.
1 parent 2ed6e7f commit 2edb2e7
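
For orientation, this is the kind of Triton kernel the new path handles: a tl.atomic_cas against a pointer that decomposes into a scalar base plus 32-bit offsets, which the buffer-ops conversion can now rewrite into amdgpu.buffer_atomic_cas. A minimal sketch, not part of this commit; the kernel name, sizes, and launch configuration are illustrative, and they mirror the i64 dense<0>/dense<2> pattern used in the lit tests below.

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def cas_kernel(ptr, out_ptr, BLOCK: tl.constexpr):
        # Illustrative kernel, not from the PR.
        pid = tl.program_id(0)
        offsets = pid * BLOCK + tl.arange(0, BLOCK)
        cmp = tl.zeros([BLOCK], dtype=tl.int64)    # expected values, like dense<0> in the tests
        val = tl.full([BLOCK], 2, dtype=tl.int64)  # replacement values, like dense<2>
        # Elementwise compare-and-swap; returns the pre-op values.
        old = tl.atomic_cas(ptr + offsets, cmp, val, sem="acq_rel", scope="gpu")
        tl.store(out_ptr + offsets, old)

    x = torch.zeros(1024, dtype=torch.int64, device="cuda")
    out = torch.empty_like(x)
    cas_kernel[(1,)](x, out, BLOCK=1024)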

File tree

8 files changed: +488 −117 lines
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck %s
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: buffer_atomic_cas_i64
+  tt.func public @buffer_atomic_cas_i64(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
+    // CHECK: %[[cas_val:.*]] = llvm.mlir.constant(2 : i64) : i64
+    // CHECK: %[[cas_val_cast:.*]] = llvm.bitcast %[[cas_val]] : i64 to i64
+    // CHECK: %[[cas_val_insert:.*]] = llvm.insertvalue %[[cas_val_cast]], %{{.*}}[1] : !llvm.struct<(i64, i64)>
+    %val = arith.constant dense<2> : tensor<512xi64, #blocked>
+
+    // CHECK: %[[cas_cmp:.*]] = llvm.mlir.constant(0 : i64) : i64
+    // CHECK: %[[cas_cmp_cast:.*]] = llvm.bitcast %[[cas_cmp]] : i64 to i64
+    // CHECK: %[[cas_cmp_insert:.*]] = llvm.insertvalue %[[cas_cmp_cast]], %{{.*}}[1] : !llvm.struct<(i64, i64)>
+    %cmp = arith.constant dense<0> : tensor<512xi64, #blocked>
+
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %offsets = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %scalar_ptr = tt.addptr %arg0, %1 : !tt.ptr<i64>, i32
+
+    // CHECK: %[[cas_val_extract:.*]] = llvm.extractvalue %[[cas_val_insert]][0] : !llvm.struct<(i64, i64)>
+    // CHECK: %[[cas_cmp_extract:.*]] = llvm.extractvalue %[[cas_cmp_insert]][0] : !llvm.struct<(i64, i64)>
+    // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK: llvm.fence syncscope("agent") release
+    // CHECK: %[[cas_val_insert2:.*]] = llvm.insertelement %[[cas_val_extract]], %{{.*}} : vector<1xi64>
+    // CHECK: %[[cas_cmp_insert2:.*]] = llvm.insertelement %[[cas_cmp_extract]], %{{.*}} : vector<1xi64>
+    // CHECK: %[[cas_val_cast2:.*]] = llvm.bitcast %[[cas_val_insert2]] : vector<1xi64> to i64
+    // CHECK: %[[cas_cmp_cast2:.*]] = llvm.bitcast %[[cas_cmp_insert2]] : vector<1xi64> to i64
+    // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[cas_val_cast2]], %[[cas_cmp_cast2]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
+    // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %{{.*}}, %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
+    // CHECK: llvm.fence syncscope("agent") acquire
+    %4 = amdgpu.buffer_atomic_cas acq_rel, gpu, %cmp, %val, %scalar_ptr[%offsets] : tensor<512xi64, #blocked>
+
+    %5 = tt.addptr %arg1, %1 : !tt.ptr<i64>, i32
+    amdgpu.buffer_store %4, %5[%offsets] : tensor<512xi64, #blocked>
+    tt.return
+  }
+}

test/Conversion/amd/buffer_load_store.mlir

Lines changed: 2 additions & 4 deletions
@@ -223,15 +223,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
   // CHECK: %[[mask2:.*]] = llvm.and %[[mask1]], %[[mask0]]
   // CHECK: %[[offset:.*]] = llvm.select %[[mask2]]
 
-  // We will have 4 calls to fadd, since the sizePerThread is 4. We should have a vmcnt between each call.
+  // We will have 4 calls to fadd, since the sizePerThread is 4. Scope/ordering instructions will be
+  // generated by the lowering of llvm.fence
   %ret = amdgpu.buffer_atomic_rmw fadd, acq_rel, gpu, %values, %arg0[%offset], %mask stride = %stride : tensor<128xf32, #blocked0>
 
   // CHECK: %[[result:.*]] = llvm.call_intrinsic "llvm.amdgcn.raw.ptr.buffer.atomic.fadd"({{.*}}, {{.*}}, %[[mask1:.*]], {{.*}}, {{.*}}) : (f32, !llvm.ptr<8>, i32, i32, i32) -> f32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "s_waitcnt vmcnt(0) ", "" : () -> !llvm.void
   // CHECK: %[[result:.*]] = llvm.call_intrinsic "llvm.amdgcn.raw.ptr.buffer.atomic.fadd"({{.*}}, {{.*}}, %[[mask1:.*]], {{.*}}, {{.*}}) : (f32, !llvm.ptr<8>, i32, i32, i32) -> f32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "s_waitcnt vmcnt(0) ", "" : () -> !llvm.void
   // CHECK: %[[result:.*]] = llvm.call_intrinsic "llvm.amdgcn.raw.ptr.buffer.atomic.fadd"({{.*}}, {{.*}}, %[[mask1:.*]], {{.*}}, {{.*}}) : (f32, !llvm.ptr<8>, i32, i32, i32) -> f32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "s_waitcnt vmcnt(0) ", "" : () -> !llvm.void
   // CHECK: %[[result:.*]] = llvm.call_intrinsic "llvm.amdgcn.raw.ptr.buffer.atomic.fadd"({{.*}}, {{.*}}, %[[mask1:.*]], {{.*}}, {{.*}}) : (f32, !llvm.ptr<8>, i32, i32, i32) -> f32
 
   // There should be a single acquire fence after all of the atomics

test/TritonGPU/amd/amd-convert-buffer-ops.mlir

Lines changed: 29 additions & 0 deletions
@@ -675,3 +675,32 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,
   // CHECK: %[[VAR_3:.*]] = amdgpu.buffer_load %[[ARG_0]][%[[VAR_2]]] : tensor<128x256xf32, #blocked>
   // CHECK: tt.return %[[VAR_3]] : tensor<128x256xf32, #blocked>
   // CHECK: }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: buffer_atomic_cas_i64
+  tt.func public @buffer_atomic_cas_i64(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
+    // CHECK: %[[val:.*]] = arith.constant dense<2>
+    %cst = arith.constant dense<2> : tensor<1024xi64, #blocked>
+    // CHECK: %[[cmp:.*]] = arith.constant dense<0>
+    %cst_0 = arith.constant dense<0> : tensor<1024xi64, #blocked>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    // CHECK: %[[offset:.*]] = tt.make_range
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    // CHECK: %[[scalar_ptr:.*]] = tt.addptr %arg0
+    %3 = tt.addptr %arg0, %1 : !tt.ptr<i64>, i32
+    %4 = tt.splat %3 : !tt.ptr<i64> -> tensor<1024x!tt.ptr<i64>, #blocked>
+    %5 = tt.addptr %4, %2 : tensor<1024x!tt.ptr<i64>, #blocked>, tensor<1024xi32, #blocked>
+    // CHECK: amdgpu.buffer_atomic_cas acq_rel, gpu, %[[cmp]], %[[val]], %[[scalar_ptr]][%[[offset]]]
+    %6 = tt.atomic_cas acq_rel, gpu, %5, %cst_0, %cst : (tensor<1024x!tt.ptr<i64>, #blocked>, tensor<1024xi64, #blocked>, tensor<1024xi64, #blocked>) -> tensor<1024xi64, #blocked>
+    %7 = tt.addptr %arg1, %1 : !tt.ptr<i64>, i32
+    %8 = tt.splat %7 : !tt.ptr<i64> -> tensor<1024x!tt.ptr<i64>, #blocked>
+    %9 = tt.addptr %8, %2 : tensor<1024x!tt.ptr<i64>, #blocked>, tensor<1024xi32, #blocked>
+    tt.store %9, %6 : tensor<1024x!tt.ptr<i64>, #blocked>
+    tt.return
+  }
+}

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 40 additions & 0 deletions
@@ -410,6 +410,46 @@ def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// BufferAtomicCASOp
+//===----------------------------------------------------------------------===//
+def BufferAtomicCASOp : TT_AMDGPU_Op<"buffer_atomic_cas", [
+  SameLoadStoreOperandsAndResultEncoding,
+  TypesMatchWith<"result element type matches the val type", "result", "val", "$_self">,
+  TypesMatchWith<"result element type matches the cmp type", "result", "cmp", "$_self">,
+  TypesMatchWith<"result element type matches the pointed type of ptr", "result", "ptr", "getPointerTypeToElement($_self)">,
+  TypesMatchWith<"result and offsets have the same shape", "result", "offsets", "getI32SameShape($_self)">,
+  TypesMatchWith<"val and offsets have the same shape", "val", "offsets", "getI32SameShape($_self)">,
+  TypesMatchWith<"val and cmp have the same shape", "val", "cmp", "$_self">,
+]> {
+  let summary = "Atomic CAS op which does a compare-exchange via a scalar base pointer and a tensor of offsets";
+  let description = [{
+    AMD buffer atomic CAS operation. Buffer atomics are similar to normal atomics, but access global memory via a
+    scalar base pointer and a tensor of offsets instead of a tensor of pointers.
+    Similar to TT_AtomicCASOp: the buffer atomic CAS op loads data at $ptr and atomically stores $val to $ptr if the
+    value at $ptr equals $cmp, with the specified memory semantics and scope. Atomic CAS ops return the pre-op value if used; otherwise the value is implicitly dropped.
+    Stride is the distance between the beginnings of contiguous memory chunks. When performing a CAS, the `stride` is
+    the address difference between the first elements of each row, in bytes. The compiler tries to obtain the `stride`
+    when it converts to buffer ops because it is important for optimizing cache memory access.
+  }];
+  let arguments = (ins
+    Arg<TT_Ptr, "Global memory pointer", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>:$ptr,
+    I32Tensor:$offsets,
+    TT_Tensor:$cmp,
+    TT_Tensor:$val,
+    Optional<I32>:$stride,
+    TT_MemSemanticAttr:$sem,
+    TT_MemSyncScopeAttr:$scope
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $sem `,` $scope `,` $cmp `,` $val `,` $ptr `[` $offsets `]`
+    (`stride` `=` $stride^)?
+    attr-dict `:` type($result)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // BufferStoreOp
 //===----------------------------------------------------------------------===//
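
For intuition on the stride operand described above, a small worked example (illustrative only, not taken from the commit; the shape and dtype are assumed): with a contiguous row-major layout, the stride is the row length times the element size.

    # Illustrative only: byte stride between rows of a contiguous
    # row-major (rows x cols) f32 tensor; the numbers are hypothetical.
    rows, cols, elem_bytes = 128, 256, 4  # f32 elements are 4 bytes wide
    stride_bytes = cols * elem_bytes      # address difference between the
                                          # first elements of adjacent rows
    assert stride_bytes == 1024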

third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.cpp

Lines changed: 24 additions & 0 deletions
@@ -127,6 +127,30 @@ BufferEmitter::emitLoadToLds(Type type, Value byteWidth, Value rsrcDesc,
                              ArrayRef<NamedAttribute>());
 }
 
+Value BufferEmitter::emitAtomicCAS(Type type, Value rsrcDesc, Value offset,
+                                   Value casCmpVal, Value casStoreVal,
+                                   Value pred, bool hasUsers) {
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
+  VectorType storeVecTy = cast<VectorType>(casStoreVal.getType());
+  VectorType cmpVecTy = cast<VectorType>(casCmpVal.getType());
+  Type bufferType = getBufferOpType(type, true);
+  if (storeVecTy != bufferType)
+    casStoreVal = b.bitcast(casStoreVal, bufferType);
+  if (cmpVecTy != bufferType)
+    casCmpVal = b.bitcast(casCmpVal, bufferType);
+  // Note: rocdl.raw.ptr.buffer.atomic.cmpswap expects
+  // val to be before cmp in the arg list. This is
+  // the opposite of the order in tl.atomic_cmpxchg
+  // and amdgpu.buffer_atomic_cas
+  SmallVector<Value, 6> args{casStoreVal, casCmpVal};
+  fillCommonArgsAtomics(type, rsrcDesc, offset, pred, hasUsers, args);
+
+  Value data = rewriter.create<ROCDL::RawPtrBufferAtomicCmpSwap>(
+      loc, bufferType, args, ArrayRef<NamedAttribute>());
+  data = b.bitcast(data, type);
+  return data;
+}
+
 Value BufferEmitter::emitAtomicRMW(RMWOp rmwType, Type type, Value rsrcDesc,
                                    Value offset, Value data, Value pred,
                                    bool hasUsers) {

third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.h

Lines changed: 4 additions & 0 deletions
@@ -80,6 +80,10 @@ struct BufferEmitter {
   Value emitAtomicRMW(RMWOp rmwType, Type type, Value rsrcDesc, Value offset,
                       Value data, Value pred, bool hasUsers);
 
+  // Emit a predicated rocdl.raw.ptr.buffer.atomic.cmpswap
+  Value emitAtomicCAS(Type type, Value rsrcDesc, Value offset, Value casCmpVal,
+                      Value casStoreVal, Value pred, bool hasUsers);
+
   // Emit a predicated rocdl.raw.ptr.buffer.store
   void emitStore(Value rsrcDesc, Value offset, Value data, Value pred,
                  CacheModifier cm);
