@@ -207,8 +207,7 @@ Value AtomicRMWEmitter::emitAtomicRMW(RewriterBase &rewriter, Value rmwPtr,
 
 Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
                                                    Value rmwPtr, Value valElem,
-                                                   Value rmwMask,
-                                                   bool checkPairs) const {
+                                                   Value rmwMask) const {
   auto loc = rmwPtr.getLoc();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   Value i64Ones = b.i64_val(~uint64_t(0));
@@ -231,44 +230,34 @@ Value AtomicRMWEmitter::emitPairedAtomicForEvenTID(RewriterBase &rewriter,
   Value dppMoveRes = shiftLeftI32ByDpp(rewriter, packedVal);
   Value operand = b.bitcast(b.or_(packedVal, dppMoveRes), packF16Ty);
 
-  // If a runtime check is unnecessary (`checkPairs` is `false`),
-  // `rightNeighbourPtr` is irrelevant.
-  // Set the conditional value `enablePackedOpt` to `true` to enable DCE on the
-  // runtime check branch.
-  Value rightNeighbourPtr = rmwPtr;
-  Value enablePackedOpt = b.true_val();
-  if (checkPairs) {
-    Value rightNeighbourAddr =
-        genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
-
-    // Packing optimization only supported if following conditions are true:
-    // 1. address is aligned by 4 bytes
-    // 2. right neighbour has adjacent address
-    // 3. both threads are active
-    Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
-    Value neighbourAddrAdjacent = b.icmp_eq(
-        rightNeighbourAddr,
-        b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
-    Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
-    Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
-    enablePackedOpt =
-        b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
-
-    // Enable only the even threads.
-    Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
-    // If one of the threads is disabled, use the neighbour's addr.
-    rightNeighbourAddr =
-        b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
-    castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
-
-    rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
-
-    // Unpack results back
-    rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
-    rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
-  } else {
-    rmwMask = b.and_(rmwMask, b.icmp_eq(isOddI32, b.i32_val(0)));
-  }
+  Value rightNeighbourAddr =
+      genI32TiledOp(rewriter, shiftLeftI32ByDpp, castedAddr);
+
+  // Packing optimization only supported if following conditions are true:
+  // 1. address is aligned by 4 bytes
+  // 2. right neighbour has adjacent address
+  // 3. both threads are active
+  Value isAligned = b.icmp_eq(b.urem(castedAddr, b.i64_val(4)), b.i64_val(0));
+  Value neighbourAddrAdjacent = b.icmp_eq(
+      rightNeighbourAddr,
+      b.add(castedAddr, b.i64_val(valueElemTy.getIntOrFloatBitWidth() / 8)));
+  Value neighbourEnabled = b.icmp_ne(i64Ones, rightNeighbourAddr);
+  Value bothEnabled = b.and_(neighbourEnabled, rmwMask);
+  Value enablePackedOpt =
+      b.and_(b.and_(isAligned, bothEnabled), neighbourAddrAdjacent);
+
+  // Enable only the even threads.
+  Value anyEnabled = b.or_(neighbourEnabled, rmwMask);
+  // If one of the threads is disabled, use the neighbour's addr.
+  rightNeighbourAddr =
+      b.select(neighbourEnabled, rightNeighbourAddr, castedAddr);
+  castedAddr = b.select(rmwMask, castedAddr, rightNeighbourAddr);
+
+  rmwMask = b.and_(anyEnabled, b.icmp_eq(isOddI32, b.i32_val(0)));
+
+  // Unpack results back
+  Value rightNeighbourPtr = b.inttoptr(rmwPtr.getType(), rightNeighbourAddr);
+  rmwPtr = b.inttoptr(rmwPtr.getType(), castedAddr);
 
   Value undefVal = b.undef(packF16Ty);
   // Build blocks to bypass the atomic instruction for ~rmwMask.
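The change drops the `checkPairs` fast path, so `emitPairedAtomicForEvenTID` now always emits the runtime pairing check and callers simply omit the trailing flag. For readers skimming the diff, below is a minimal host-side sketch, in plain C++ rather than emitted IR, of the predicate that gates the packed f16 atomic; the helper name and scalar signature are illustrative only, but the three gating conditions mirror the comment in the hunk above.

```cpp
#include <cstdint>

// Sketch of the `enablePackedOpt` predicate computed per even/odd lane pair.
// `addr` is the even lane's byte address, `rightAddr` the odd neighbour's
// address obtained via the DPP shift (inactive lanes carry ~0, i.e. i64Ones),
// and `evenLaneMask` corresponds to `rmwMask` on the even lane.
bool enablePackedOpt(uint64_t addr, uint64_t rightAddr, bool evenLaneMask,
                     uint64_t elemBytes) {
  bool isAligned = addr % 4 == 0;                         // 1. 4-byte alignment
  bool neighbourAdjacent = rightAddr == addr + elemBytes; // 2. adjacent address
  bool neighbourEnabled = rightAddr != ~uint64_t(0);
  bool bothEnabled = neighbourEnabled && evenLaneMask;    // 3. both lanes active
  return isAligned && bothEnabled && neighbourAdjacent;
}
```

Afterwards `rmwMask` is narrowed to `anyEnabled && !isOdd`, so only even lanes issue the (possibly packed) atomic.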