[AMD] Remove bfloat16 fadd buffer atomic for gfx942 (#7011)

karthik-man · web-flow · commit 5e00f356625b · 2025-06-03T13:13:50.000-07:00
The only supported 16-bit buffer atomic fadd dtype on gfx942 is float16
(BUFFER_ATOMIC_PK_ADD_F16). There seems to be no corresponding
instruction for bf16. Without this PR, the following kernel

```
import torch
import triton
import triton.language as tl

@triton.jit
def atomic_add_bf16(X, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    t1 = tl.full((BLOCK_SIZE, ), 1, dtype=tl.bfloat16)
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    tl.atomic_add(X + offsets, t1)

X = torch.tensor([0, 1] * 256, device='cuda', dtype=torch.bfloat16)
Z = torch.tensor([1, 2] * 256, device='cuda', dtype=torch.bfloat16)
k = atomic_add_bf16[(1,)](X, 2 * 256)
assert (torch.equal(X, Z))
print("Success!")
```
will fail in instruction selection with the following error:

> ~/triton]$ python ~/kernels/atomic_add_bf16.py
> LLVM ERROR: Cannot select: t56: v2bf16,ch =
BUFFER_ATOMIC_FADD<(volatile dereferenceable load store (s32) on %ir.10,
align 1, addrspace 8)> # D:1 t34, t37, t49, Constant:i32<0>, t20,
Constant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>,
TargetConstant:i1<0>, atomic_add_bf16.py:11:31
> ..
diff --git a/test/TritonGPU/amd/amd-convert-buffer-ops.mlir b/test/TritonGPU/amd/amd-convert-buffer-ops.mlir
@@ -546,6 +546,27 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: atomic_add_bf16
+  tt.func public @atomic_add_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
+    %cst = arith.constant dense<true> : tensor<512xi1, #blocked>
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<512xbf16, #blocked>
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.addptr %arg0, %1 : !tt.ptr<bf16>, i32
+    %4 = tt.splat %3 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked>
+    %5 = tt.addptr %4, %2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked>
+    // CHECK-NOT: amdgpu.buffer_atomic_rmw
+    %6 = tt.atomic_rmw fadd, acq_rel, gpu, %5, %cst_0, %cst : (tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xbf16, #blocked>, tensor<512xi1, #blocked>) -> tensor<512xbf16, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: assume_positive_offset_buffer_atomic
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp
@@ -262,10 +262,10 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW
       mlir::MLIRContext *context,
       DenseMap<Value, SetVector<Operation *>> &assumptions,
       ModuleAxisInfoAnalysis &axisAnalysisPass,
-      std::shared_ptr<DataFlowSolver> solver)
+      std::shared_ptr<DataFlowSolver> solver, ISAFamily isaFamily)
       : mlir::OpRewritePattern<triton::AtomicRMWOp>(context),
         assumptions(assumptions), axisAnalysisPass(axisAnalysisPass),
-        solver(std::move(solver)) {}
+        solver(std::move(solver)), isaFamily(isaFamily) {}
 
   mlir::LogicalResult
   matchAndRewrite(triton::AtomicRMWOp op,
@@ -323,6 +323,14 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW
     }
     LDBG("RMW supported type");
 
+    // float16 is the only 16-bit dtype supported by buffer atomic fadd on
+    // gfx942
+    if (isaFamily == ISAFamily::CDNA3 && checkType.isBF16() &&
+        atomicRmwOp == RMWOp::FADD) {
+      return rewriter.notifyMatchFailure(op, "RMW FADD does not support bf16");
+    }
+    LDBG("RMW FADD supported 16-bit type");
+
     auto vecSize = getVectorSize(ptr, axisAnalysisPass);
     // f16/bf16 dtypes could only be efficiently calculated using instructions
     // that pack 2 elements (e.g. @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16)
@@ -387,6 +395,7 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW
   DenseMap<Value, SetVector<Operation *>> assumptions;
   ModuleAxisInfoAnalysis &axisAnalysisPass;
   std::shared_ptr<DataFlowSolver> solver;
+  ISAFamily isaFamily;
 };
 
 // Workaround to allow static_assert(false) on older compilers as it was
@@ -541,9 +550,11 @@ class TritonAMDGPUConvertToBufferOpsPass
     // Gate buffer atomics behind CDNA3 for now
     // GFX942-specific assumptions regarding cache coherence are made when
     // lowering to LLVM
-    if (ISAFamily::CDNA3 == triton::AMD::deduceISAFamily(archGenerationName))
+    triton::AMD::ISAFamily isaFamily =
+        triton::AMD::deduceISAFamily(archGenerationName);
+    if (ISAFamily::CDNA3 == isaFamily)
       patterns.add<ConvertTritonAtomicRMWOpToBufferAtomicRMW>(
-          context, assumptions, axisInfoAnalysis, solver);
+          context, assumptions, axisInfoAnalysis, solver, isaFamily);
 
     if (applyPatternsGreedily(mod, std::move(patterns)).failed())
       signalPassFailure();