
Commit 16c3c1a

joviliast, antiagainst, and scxiao authored and committed
[AMD] Fix packed fp16 atomic optimization conditions (triton-lang#5839)
We need to make sure threads in a pair are both active and the address is aligned to 4 bytes.

Signed-off-by: Ilya Veselov <[email protected]>
Co-authored-by: Lei Zhang <[email protected]>
Co-authored-by: Shucai Xiao <[email protected]>
1 parent 3efde92 commit 16c3c1a
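
The fixed condition can be summarized per lane pair. Below is a minimal Python sketch of the eligibility check, assuming hypothetical per-lane addresses and activity flags; it mirrors the three conditions listed in the LoadStoreOpToLLVM.cpp diff below and is an illustration, not the generated code.

# Hypothetical sketch: when may an even/odd lane pair be combined into one
# packed 2 x fp16 atomic? Mirrors the conditions checked in the diff below.
def can_use_packed_fp16(even_addr, odd_addr, even_active, odd_active):
    is_aligned = even_addr % 4 == 0            # 1. address is aligned by 4 bytes
    is_adjacent = odd_addr == even_addr + 2    # 2. right neighbour holds the adjacent fp16
    both_active = even_active and odd_active   # 3. both threads in the pair are active
    return is_aligned and is_adjacent and both_active

# Lanes writing x[0] and x[1] of an fp16 tensor at 0x1000 qualify; a pair
# starting at x[1] (address 0x1002) is misaligned and must fall back.
assert can_use_packed_fp16(0x1000, 0x1002, True, True)
assert not can_use_packed_fp16(0x1002, 0x1004, True, True)
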

File tree

2 files changed, +168 -33 lines changed

python/test/unit/language/test_core.py

Lines changed: 73 additions & 0 deletions
@@ -1655,6 +1655,79 @@ def torch_to_triton_dtype(t):
     np.testing.assert_equal(old_ref, to_numpy(old_tri))
 
 
+@pytest.mark.interpreter
+@pytest.mark.parametrize("size, num_ctas, dtype_x_str", [(size, num_ctas, dtype_x_str)
+                                                         for size in [2, 4, 8, 32, 64, 128]
+                                                         for num_ctas in num_ctas_list
+                                                         for dtype_x_str in ['float16']])
+def test_tensor_atomic_add_non_exclusive_offset(size, num_ctas, dtype_x_str, device):
+
+    @triton.jit
+    def kernel(X, val, NUM: tl.constexpr):
+        off = tl.arange(0, NUM)
+        offset = off[:, None] * NUM + off[None, :]
+        val = tl.load(val + offset)
+        tl.atomic_add(X + offset // 2, val)
+
+    shape = (size // 2, size)
+    x = torch.zeros(shape, dtype=getattr(torch, dtype_x_str), device=device)
+    val = torch.randn((size**2), dtype=getattr(torch, dtype_x_str), device=device)
+    kernel[(1, )](x, val, size, num_warps=1, num_ctas=num_ctas)
+    ref = val[0::2] + val[1::2]
+    torch.testing.assert_close(ref, x.reshape(math.prod(shape)))
+
+
+@pytest.mark.interpreter
+@pytest.mark.parametrize("shape, idx_order, mask_step, num_ctas, dtype_x_str",
+                         [(shape, idx_order, mask_step, num_ctas, dtype_x_str)
+                          for shape in [(2, 2), (5, 5), (6, 6), (8, 8)]
+                          for idx_order in ['increase', 'decrease', 'random_no_duplication', 'random']
+                          for mask_step in range(1, 5)
+                          for num_ctas in num_ctas_list
+                          for dtype_x_str in ['float16']])
+def test_tensor_atomic_add_access_patterns(shape, idx_order, mask_step, num_ctas, dtype_x_str, device):
+    check_type_supported(dtype_x_str, device)
+    if is_interpreter():
+        pytest.skip("not supported in the interpreter")
+
+    @triton.jit
+    def kernel(in_ptr, idx_ptr, out_ptr, shape0, shape1, mask_step, XBLOCK: tl.constexpr):
+        xoffset = tl.program_id(0) * XBLOCK
+        x_idx = xoffset + tl.arange(0, XBLOCK)[:]
+        mask = x_idx < shape0 * shape1
+        mask = mask and (x_idx % mask_step != 0)
+        idx_base = shape1 * (x_idx // shape1)
+        idx_offset = tl.load(idx_ptr + x_idx, mask)
+        in_elem = tl.load(in_ptr + x_idx, mask)
+        tl.atomic_add(out_ptr + (idx_offset + idx_base), in_elem, mask, sem='relaxed')
+
+    shape0, shape1 = shape
+    idx_row = torch.arange(0, shape1, device=device)
+    if idx_order == 'increase':
+        idx = torch.stack([idx_row.repeat_interleave(i + 1)[:shape1] for i in range(shape0)])
+    if idx_order == 'decrease':
+        idx = torch.stack([idx_row.flip(0).repeat_interleave(i + 1)[:shape1] for i in range(shape0)])
+    if idx_order == 'random_no_duplication':
+        idx = torch.stack([torch.randperm(shape1, device=device) for _ in idx_row])
+    if idx_order == 'random':
+        idx = torch.randint(0, shape1, size=(shape0, shape1), device=device)
+
+    val = torch.randn((shape0, shape1), dtype=getattr(torch, dtype_x_str), device=device)
+    dst = torch.randn((shape0, shape1), dtype=getattr(torch, dtype_x_str), device=device)
+
+    dst_ref = dst.clone()
+
+    cnt = 0
+    for i, row in enumerate(idx):
+        for j, elem in enumerate(row):
+            if cnt % mask_step != 0:
+                dst_ref[i][elem] += val[i][j]
+            cnt += 1
+
+    kernel[(1, )](val, idx, dst, shape0, shape1, mask_step, 64, num_ctas=num_ctas)
+    np.testing.assert_allclose(to_numpy(dst_ref), to_numpy(dst), atol=1e-2)
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize("num_ctas", num_ctas_list)
 def test_tensor_atomic_rmw_block(num_ctas, device):

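As a quick reading aid for test_tensor_atomic_add_non_exclusive_offset above: because of the `offset // 2` addressing, every two consecutive values land on the same destination element, so the expected result is a pairwise sum. A plain PyTorch restatement of the reference, assuming size = 4 and no Triton involved:

import torch

# Pairwise-sum reference equivalent to the test's `ref = val[0::2] + val[1::2]`,
# spelled out element by element (assumed size = 4).
size = 4
val = torch.randn(size * size, dtype=torch.float16)
out = torch.zeros(size * size // 2, dtype=torch.float16)
for i in range(size * size):
    out[i // 2] += val[i]          # same mapping as tl.atomic_add(X + offset // 2, val)
assert torch.allclose(out, val[0::2] + val[1::2], atol=1e-2)
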
third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 95 additions & 33 deletions
@@ -176,8 +176,7 @@ struct LoadStoreConversionBase {
     return axisAnalysisPass.getMaskAlignment(mask);
   }
 
-  std::optional<const std::string>
-  getAMDGPUMemScopeStr(MemSyncScope scope) const {
+  std::optional<const char *> getAMDGPUMemScopeStr(MemSyncScope scope) const {
     // See: https://llvm.org/docs/AMDGPUUsage.html#memory-scopes
     auto scopeStr = "";
     switch (scope) {
@@ -1295,44 +1294,43 @@ struct AtomicRMWOpConversion
     // tt::atomicRmwOp(%ptr, %val, %mask):
     // 0. Group thread by pairs. Master thread is (tid % 2 == 0);
     // 1. All the threads send %val to (tid - 1) thread via dppUpdateOp shl, so
-    //    all the masters recieve value from secondary threads;
+    //    all the masters receive value from secondary threads;
     // 2. Take into account parity in the %mask value, build control flow
     //    structures according to it;
     // 3. Generate llvm::atomicRmwOp in the threads enabled by %mask value;
     // 4. All the threads send result of generated operation to (tid + 1) thread
-    //    via dppUpdateOp shl, so all secondary thread also recieve their
+    //    via dppUpdateOp shl, so all secondary thread also receive their
    //    result.
     //
     // This approach enables us to use half the active threads committing atomic
     // requests to avoid generating of code providing unified access to f16
-    // element and reduce contantion.
+    // element and reduce contention.
     bool useDppForPackedF16 = false;
     // tensor
     if (tensorTy) {
       auto valTy = cast<RankedTensorType>(val.getType());
       bool isF16Ty = valueElemTy.isF16() || valueElemTy.isBF16();
       unsigned availableVecSize = isF16Ty ? 2 : 1;
       vec = std::min<unsigned>(vec, availableVecSize);
-      // Force F16 packing in the case it's not comming in as packed, but the
+      // Force F16 packing in the case it's not coming in as packed, but the
       // ISA can support packed atomic instructions.
       useDppForPackedF16 =
           supportsGlobalAtomicF16PackedAndDpp(targetInfo.getISAFamily()) &&
-          vec == 1 && isF16Ty && atomicRmwAttr == RMWOp::FADD;
+          vec == 1 && isF16Ty && atomicRmwAttr == RMWOp::FADD &&
+          !enableIntraWaveReduce;
       // mask
       numElems = tensorTy.getNumElements();
     }
-    Value mask = b.int_val(1, 1);
+    Value mask = b.true_val();
     auto tid = getThreadId(rewriter, loc);
     mask = b.and_(mask, b.icmp_slt(b.mul(tid, b.i32_val(elemsPerThread)),
                                    b.i32_val(numElems)));
-    if (useDppForPackedF16)
-      mask = b.and_(mask, b.icmp_eq(b.urem(tid, b.i32_val(2)), b.i32_val(0)));
 
     auto memOrdering = op.getSem();
     auto scope = op.getScope();
     auto atomicMemOrdering = getMemoryOrdering(memOrdering);
 
-    auto scopeStr = getAMDGPUMemScopeStr(scope);
+    std::optional<const char *> scopeStr = getAMDGPUMemScopeStr(scope);
     if (!scopeStr)
       return rewriter.notifyMatchFailure(op, "Unknown AMDGPU memory scope");

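To make the comment block above more concrete, here is a small Python simulation of the pairing scheme, with plain lists standing in for wavefront lanes and the dpp shl value exchange modeled as a list shift. This is a conceptual sketch only; it assumes each odd lane's element sits right after its even neighbour's, which is exactly what the new checks verify.

# Conceptual simulation (not GPU code): odd lanes forward their value one lane
# to the left, and only even "master" lanes commit an update covering two
# adjacent fp16 elements, standing in for one packed 2 x fp16 atomic fadd.
def simulate_packed_fadd(memory, index, vals):
    # memory: destination buffer, index[lane]: element index each lane targets,
    # vals[lane]: value each lane wants to add atomically.
    shifted = vals[1:] + [0.0]                    # "dpp shl": lane i sees lane i + 1's value
    for lane in range(0, len(vals), 2):           # masters only (tid % 2 == 0)
        memory[index[lane]] += vals[lane]         # low half of the packed add
        memory[index[lane] + 1] += shifted[lane]  # high half: the odd neighbour's value

mem = [0.0] * 4
simulate_packed_fadd(mem, index=[0, 1, 2, 3], vals=[1.0, 2.0, 3.0, 4.0])
assert mem == [1.0, 2.0, 3.0, 4.0]
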
@@ -1346,24 +1344,59 @@ struct AtomicRMWOpConversion
       // elemsPerThread.
       Value rmwMask = llMask ? b.and_(mask, maskElements[i]) : mask;
 
+      Value i64Ones = b.i64_val(~uint64_t(0));
+      Value i64Zeros = b.i64_val(0);
       Value operand;
+      Value rightNeighbourPtr;
+      Value enablePackedOpt;
       if (useDppForPackedF16) {
+        Value isOddI32 = b.urem(tid, b.i32_val(2));
+        // First check if odd threads hold adjacent ptrs to even ones.
+        Value castedAddr = b.ptrtoint(i64_ty, rmwPtr);
+        // Set casted addr to all ones if the thread is disabled.
+        castedAddr = b.select(rmwMask, castedAddr, i64Ones);
+
         // Move %val to left neighbour to proceed packed atomic further.
         Value packedVal = b.null(packF16Ty);
-        packedVal = b.insert_element(packF16Ty, packedVal, valElements[i],
-                                     b.i32_val(0));
-        // Pack to i32 type to simplify transaction
+        packedVal =
+            b.insert_element(packF16Ty, packedVal, valElements[i], isOddI32);
+        // Pack to i32 type to simplify transaction.
         packedVal = b.bitcast(packedVal, i32_ty);
+        // Zero operands for disabled threads to make addition no op.
+        packedVal = b.select(rmwMask, packedVal, b.i32_val(0));
         Value dppMoveRes = shiftLeftI32ByDpp(rewriter, packedVal);
+
+        Value rightNeighbourAddr =
+            genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
+
+        // Packing optimization only supported if following conditions are true:
+        // 1. address is aligned by 4 bytes
+        // 2. right neighbour has adjacent address
+        // 3. both threads are active
+        Value isAligned =
+            b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
+        Value neighbourAddrAdjacent = b.icmp_eq(
+            rightNeighbourAddr,
+            b.add(castedAddr,
+                  b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
+        Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
+        Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
+        enablePackedOpt =
+            b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
+
+        // Enable only the even threads.
+        Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
+        // If one of the threads is disabled, use the neighbour's addr.
+        rightNeighbourAddr =
+            b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
+        castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
+
+        rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
+
         // Unpack results back
-        Value unpackedDppRes = b.bitcast(dppMoveRes, packF16Ty);
-        operand = b.undef(packF16Ty);
-        operand =
-            b.insert_element(packF16Ty, operand, valElements[i], b.i32_val(0));
-        operand = b.insert_element(
-            packF16Ty, operand,
-            b.extract_element(valueElemTy, unpackedDppRes, b.i32_val(0)),
-            b.i32_val(1));
+        rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
+        rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
+        operand = b.bitcast(b.or_(packedVal, dppMoveRes), packF16Ty);
       } else if (vec == 1) {
         operand = valElements[i];
       } else {
@@ -1388,15 +1421,47 @@ struct AtomicRMWOpConversion
       rewriter.setInsertionPointToEnd(atomicBlock);
       auto maybeKind = matchAtomicOp(atomicRmwAttr);
       Value atom;
+      Value isVecOp;
       if (enableIntraWaveReduce) {
         atom = atomicIntraWaveReduce(rewriter, rmwPtr, operand, *maybeKind,
-                                     atomicMemOrdering, scopeStr.value());
+                                     atomicMemOrdering, *scopeStr);
       } else {
-        atom = rewriter
-                   .create<LLVM::AtomicRMWOp>(loc, *maybeKind, rmwPtr, operand,
-                                              atomicMemOrdering,
-                                              StringRef(scopeStr.value()))
-                   .getResult();
+        if (useDppForPackedF16) {
+          // Determine on the runtime what atomic intrinsic to execute:
+          // packed or regular.
+          auto *packedBlock =
+              atomicBlock->splitBlock(rewriter.getInsertionPoint());
+          auto *regularBlock =
+              rewriter.createBlock(atomicBlock->getParent(),
+                                   std::next(Region::iterator(atomicBlock)));
+          rewriter.setInsertionPointToEnd(atomicBlock);
+          rewriter.create<LLVM::CondBrOp>(loc, enablePackedOpt, packedBlock,
+                                          regularBlock);
+
+          // Fill out the regular block, where we issue two atomic ops.
+          rewriter.setInsertionPointToEnd(regularBlock);
+          Value pairedOperand0 =
+              b.extract_element(valueElemTy, operand, b.i32_val(0));
+          Value pairedOperand1 =
+              b.extract_element(valueElemTy, operand, b.i32_val(1));
+          Value atomNonVec0 = rewriter.create<LLVM::AtomicRMWOp>(
+              loc, *maybeKind, rmwPtr, pairedOperand0, atomicMemOrdering,
+              *scopeStr);
+          Value atomNonVec1 = rewriter.create<LLVM::AtomicRMWOp>(
+              loc, *maybeKind, rightNeighbourPtr, pairedOperand1,
+              atomicMemOrdering, *scopeStr);
+          Value packedRes = b.undef(packF16Ty);
+          packedRes =
+              b.insert_element(packF16Ty, packedRes, atomNonVec0, b.i32_val(0));
+          packedRes =
+              b.insert_element(packF16Ty, packedRes, atomNonVec1, b.i32_val(1));
+          rewriter.create<LLVM::BrOp>(loc, packedRes, endBlock);
+
+          // Start to fill out the packed block.
+          rewriter.setInsertionPointToEnd(packedBlock);
+        }
+        atom = rewriter.create<LLVM::AtomicRMWOp>(
+            loc, *maybeKind, rmwPtr, operand, atomicMemOrdering, *scopeStr);
       }
 
       if (!tensorTy) {
@@ -1623,11 +1688,8 @@ struct AtomicRMWOpConversion
       rewriter.setInsertionPointToEnd(leaderBlock);
       // Utilize global atomic only by leader threads
      rmwPtr = b.inttoptr(origPtrType, rmwPtr);
-      Value atom = rewriter
-                       .create<LLVM::AtomicRMWOp>(loc, opKind, rmwPtr,
-                                                  afterRedBlock->getArgument(0),
-                                                  memOrdering, scope)
-                       .getResult();
+      Value atom = rewriter.create<LLVM::AtomicRMWOp>(
+          loc, opKind, rmwPtr, afterRedBlock->getArgument(0), memOrdering, scope);
       rewriter.create<LLVM::BrOp>(loc, atom, endBlock);
       rewriter.setInsertionPointToStart(endBlock);

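Taken together, the hunks above make the packed-versus-scalar decision at runtime per lane pair. A rough pseudo-Python sketch of the emitted control flow follows; the names are illustrative, and the real code builds LLVM basic blocks and AtomicRMW ops rather than calling a helper.

# Illustrative control flow per even lane after the change: one packed atomic
# when the pair qualifies, otherwise two independent scalar fp16 atomics.
def lower_pair(ptr, neighbour_ptr, operand_lo, operand_hi, enable_packed_opt, atomic_fadd):
    if enable_packed_opt:
        # Packed path: a single atomic covering both adjacent fp16 elements.
        return atomic_fadd(ptr, (operand_lo, operand_hi))
    # Regular path: misaligned, non-adjacent, or a partially inactive pair.
    res_lo = atomic_fadd(ptr, operand_lo)
    res_hi = atomic_fadd(neighbour_ptr, operand_hi)
    return (res_lo, res_hi)
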