[AMD] Fix f16/bf16 buffer atomic operations properly (#6139)

joviliast · web-flow · commit 86906cf32cfa · 2025-03-07T07:59:05.000-08:00
Reverted the workaround that disabled f16/bf16 buffer atomic operations (#6090). Added a check on the vector size to ensure that the packed instructions required for f16/bf16 are applicable. See also: #6090 (the workaround) and #6126 (the root-cause fix). Signed-off-by: Ilya Veselov <iveselov.nn@gmail.com>
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp
@@ -313,18 +313,23 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW
     // 4. Buffer atomic RMW does not support FP8 ops
     //    easier to just check what we support
     auto checkType = getElementTypeOrSelf(op.getVal());
-    // TODO: F16 and BF16 data types are supported by intrinsics with packed
-    // arithmetic on adjacent addresses, requiring the leading address to be
-    // 4-byte aligned. A runtime check should be implemented to enforce this
-    // requirement and ensure fallback to regular atomic operations when
-    // alignment is not met.
-    bool isSupportedType = checkType.isF32() || checkType.isF64() ||
+    bool isSupportedType = checkType.isF16() || checkType.isBF16() ||
+                           checkType.isF32() || checkType.isF64() ||
                            checkType.isInteger(32) || checkType.isInteger(64);
     if (!isSupportedType) {
       return rewriter.notifyMatchFailure(op, "RMW with unsupported type");
     }
     LDBG("RMW supported type");
 
+    auto vecSize = getVectorSize(ptr, axisAnalysisPass);
+    // f16/bf16 dtypes could only be efficiently calculated using instructions
+    // that pack 2 elements (e.g. @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16)
+    if (vecSize % 2 != 0 && (checkType.isF16() || checkType.isBF16())) {
+      return rewriter.notifyMatchFailure(
+          op, "RMW float 16 dtypes must be aligned by 2");
+    }
+    LDBG("RMW passed alignment check");
+
     // 5. Check if the RMWOp is supported
     switch (atomicRmwOp) {
     case RMWOp::AND:
@@ -355,8 +360,7 @@ struct ConvertTritonAtomicRMWOpToBufferAtomicRMW
       // are contiguous we can emit the buffer op. Otherwise, the buffer ops
       // lowering will try to emit individual (unsupported) f16/bf16 ops.
       auto elemBitWidth = tensorType.getElementTypeBitWidth();
-      opBitWidth =
-          getVectorSize(basePtr, tensorOffset, axisAnalysisPass) * elemBitWidth;
+      opBitWidth = vecSize * elemBitWidth;
     } else {
       opBitWidth = opValueType.getIntOrFloatBitWidth();
     }