
Commit 36b3473

[AMD] Always CheckPairs for packed fp16/bf16 atomic instructions (#7326)
Packed atomic ops are only valid when a few conditions are met. PR triton-lang/triton#6258, which refactored these checks, added a shortcut that could skip the runtime check and thereby introduced a bug: it missed the case where an even-lane tid accesses an address that is not 4-byte aligned. We therefore need to always perform that check; this PR changes the lowering to always run the `checkPairs` logic.
1 parent 23b8a7d commit 36b3473
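
For context, the failure mode the restored check guards against can be sketched with a minimal Triton kernel (illustrative only and not part of this commit; the kernel name, grid, and the assumption that the paired fp16 atomic path is selected are all hypothetical). Adjacent lanes write adjacent fp16 elements, but the run starts at an odd element index, so the even lane's address is only 2-byte aligned and a packed 32-bit atomic would be misaligned:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def shifted_atomic_add(X, V, N: tl.constexpr):
    # Each lane adds one fp16 value at offset 1 + lane, so pairs of lanes
    # straddle 4-byte boundaries: lane 0 touches byte offset 2, lane 1 byte 4.
    offs = tl.arange(0, N)
    vals = tl.load(V + offs)
    tl.atomic_add(X + 1 + offs, vals)


def run(n=64, device="cuda"):  # device assumed visible as "cuda" (ROCm maps HIP here)
    x = torch.zeros(n + 1, dtype=torch.float16, device=device)
    v = torch.randn(n, dtype=torch.float16, device=device)
    shifted_atomic_add[(1, )](x, v, n, num_warps=1)
    ref = torch.zeros_like(x)
    ref[1:n + 1] = v
    torch.testing.assert_close(x, ref)
```

Before this fix, a pattern like the shifted row in the new test below could be lowered to a packed atomic without re-verifying at runtime that the even lane's address is 4-byte aligned.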

File tree

- python/test/unit/language/test_core.py
- third_party/amd/lib/TritonAMDGPUToLLVM/AtomicRMWOpsEmitter.cpp
- third_party/amd/lib/TritonAMDGPUToLLVM/AtomicRMWOpsEmitter.h
- third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

4 files changed: +61 −45 lines changed

python/test/unit/language/test_core.py

Lines changed: 30 additions & 0 deletions
@@ -1731,6 +1731,36 @@ def kernel(X, val, NUM: tl.constexpr):
     torch.testing.assert_close(ref, x.reshape(math.prod(shape)))


+@pytest.mark.interpreter
+@pytest.mark.parametrize("size, num_ctas, dtype_x_str", [(size, num_ctas, dtype_x_str)
+                                                         for size in [2, 4, 8, 32, 64, 128]
+                                                         for num_ctas in num_ctas_list
+                                                         for dtype_x_str in ['bfloat16', 'float16', 'float32']])
+def test_tensor_atomic_add_shift_1(size, num_ctas, dtype_x_str, device):
+    check_type_supported(dtype_x_str, device)
+
+    @triton.jit
+    def kernel(X, val, NUM: tl.constexpr):
+        off_x = tl.arange(0, 2)
+        off_y = tl.arange(0, NUM)
+        off_in = off_x[:, None] * NUM + off_y[None, :]
+        off_out = off_x[:, None] + off_y[None, :]
+
+        val = tl.load(val + off_in)
+        tl.atomic_add(X + off_out, val)
+
+    s = (2, size)
+    dtype = getattr(torch, dtype_x_str)
+    x = torch.zeros(s, dtype=dtype, device=device)
+    ref = torch.flatten(x)
+    val = torch.randn(s, dtype=dtype, device=device)
+    kernel[(1, )](x, val, size, num_warps=1, num_ctas=num_ctas)
+    val = torch.flatten(val)
+    ref[0:size] = val[0:size]
+    ref[1:size + 1] += val[size:2 * size]
+    torch.testing.assert_close(ref, torch.flatten(x))
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize("shape, idx_order, mask_step, num_ctas, dtype_x_str",
                          [(shape, idx_order, mask_step, num_ctas, dtype_x_str)

third_party/amd/lib/TritonAMDGPUToLLVM/AtomicRMWOpsEmitter.cpp

Lines changed: 29 additions & 40 deletions
@@ -207,8 +207,7 @@ Value AtomicRMWEmitter::emitAtomicRMW(RewriterBase &rewriter, Value rmwPtr,

 Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
                                                    Value rmwPtr, Value valElem,
-                                                   Value rmwMask,
-                                                   bool checkPairs) const {
+                                                   Value rmwMask) const {
   auto loc = rmwPtr.getLoc();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   Value i64Ones = b.i64_val(~uint64_t(0));
@@ -231,44 +230,34 @@ Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
   Value dppMoveRes = shiftLeftI32ByDpp(rewriter, packedVal);
   Value operand = b.bitcast(b.or_(packedVal, dppMoveRes), packF16Ty);

-  // If a runtime check is unnecessary (`checkPairs` is `false`),
-  // `rightNeighbourPtr` is irrelevant.
-  // Set the conditional value `enablePackedOpt` to `true` to enable DCE on the
-  // runtime check branch.
-  Value rightNeighbourPtr = rmwPtr;
-  Value enablePackedOpt = b.true_val();
-  if (checkPairs) {
-    Value rightNeighbourAddr =
-        genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
-
-    // Packing optimization only supported if following conditions are true:
-    // 1. address is aligned by 4 bytes
-    // 2. right neighbour has adjacent address
-    // 3. both threads are active
-    Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
-    Value neighbourAddrAdjacent = b.icmp_eq(
-        rightNeighbourAddr,
-        b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
-    Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
-    Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
-    enablePackedOpt =
-        b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
-
-    // Enable only the even threads.
-    Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
-    // If one of the threads is disabled, use the neighbour's addr.
-    rightNeighbourAddr =
-        b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
-    castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
-
-    rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
-
-    // Unpack results back
-    rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
-    rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
-  } else {
-    rmwMask = b.and_(rmwMask, b.icmp_eq(isOddI32, b.i32_val(0)));
-  }
+  Value rightNeighbourAddr =
+      genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
+
+  // Packing optimization only supported if following conditions are true:
+  // 1. address is aligned by 4 bytes
+  // 2. right neighbour has adjacent address
+  // 3. both threads are active
+  Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
+  Value neighbourAddrAdjacent = b.icmp_eq(
+      rightNeighbourAddr,
+      b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
+  Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
+  Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
+  Value enablePackedOpt =
+      b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
+
+  // Enable only the even threads.
+  Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
+  // If one of the threads is disabled, use the neighbour's addr.
+  rightNeighbourAddr =
+      b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
+  castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
+
+  rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
+
+  // Unpack results back
+  Value rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
+  rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);

   Value undefVal = b.undef(packF16Ty);
   // Build blocks to bypass the atomic instruction for ~rmwMask.
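
In other words, with `checkPairs` removed, the emitter now always evaluates the three conditions listed in the comment above at runtime. A rough Python sketch of the per-even-lane predicate (function and parameter names are illustrative, not the emitter's API):

```python
def enable_packed_opt(addr: int, right_addr: int,
                      lane_active: bool, neighbour_active: bool,
                      elem_bytes: int = 2) -> bool:
    """Sketch of the predicate that gates the packed fp16/bf16 atomic.

    `addr` and `right_addr` stand for the byte addresses held by an even lane
    and its odd right neighbour (exchanged via the DPP shift in the real code).
    """
    is_aligned = addr % 4 == 0                             # 1. 4-byte aligned
    neighbour_adjacent = right_addr == addr + elem_bytes   # 2. adjacent address
    both_enabled = lane_active and neighbour_active        # 3. both threads active
    return is_aligned and neighbour_adjacent and both_enabled
```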

third_party/amd/lib/TritonAMDGPUToLLVM/AtomicRMWOpsEmitter.h

Lines changed: 1 addition & 2 deletions
@@ -21,8 +21,7 @@ class AtomicRMWEmitter {
                        bool enableIntraWaveReduce) const;

   Value emitPairedAtomicForEvenTID(RewriterBase &rewriter, Value rmwPtr,
-                                   Value valElem, Value rmwMask,
-                                   bool checkPairs = true) const;
+                                   Value valElem, Value rmwMask) const;

 private:
   const mlir::triton::AMD::TargetInfo &targetInfo;

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 1 addition & 3 deletions
@@ -1489,7 +1489,6 @@ struct AtomicRMWOpConversion
     // TODO: support data types less than 32 bits
     enableIntraWaveReduce &= valueElemTy.getIntOrFloatBitWidth() >= 32;

-    bool checkPairs = true;
     if (tensorTy) {
       bool isF16Ty = valueElemTy.isF16() || valueElemTy.isBF16();
       unsigned availableVecSize = isF16Ty ? 2 : 1;
@@ -1505,7 +1504,6 @@
       auto threadOrder = getThreadOrder(tensorTy);
       unsigned contigWithinLanes =
           axisAnalysisPass.getAxisInfo(ptr)->getContiguity(threadOrder.front());
-      checkPairs = !(contigWithinLanes > 1 && contigWithinLanes % 2 == 0);
       enableIntraWaveReduce &= contigWithinLanes == 1;
     }

@@ -1530,7 +1528,7 @@
       Value rmwMask = llMask ? b.and_(mask, maskElements[i]) : mask;
       if (applyPackingF16) {
         resultVals[i] = emitter.emitPairedAtomicForEvenTID(
-            rewriter, ptrElements[i], valElements[i], rmwMask, checkPairs);
+            rewriter, ptrElements[i], valElements[i], rmwMask);
       } else {
         Value valElement;
         if (vec == 1) {
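
The deleted shortcut assumed that an even, greater-than-one contiguity within lanes implies the even lane's address is 4-byte aligned, which fails when the contiguous run starts at an odd fp16 element (exactly the shifted row exercised by the new test). A small sketch of the arithmetic (hypothetical helper, not compiler code):

```python
def even_lane_aligned(base_elem: int, lane: int, elem_bytes: int = 2) -> bool:
    # Byte address of the element accessed by `lane`, assuming unit-stride
    # fp16 accesses (contiguity >= 2) starting at element index `base_elem`.
    addr = (base_elem + lane) * elem_bytes
    return lane % 2 == 0 and addr % 4 == 0


# base_elem = 0: every even lane is 4-byte aligned, so packing is safe.
assert all(even_lane_aligned(0, lane) for lane in range(0, 8, 2))
# base_elem = 1 (a shifted row): even lanes land on odd element indices,
# so their addresses are only 2-byte aligned and packing must be skipped.
assert not any(even_lane_aligned(1, lane) for lane in range(0, 8, 2))
```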
