Commit 3167930

[AMD] Improved CanonicalizePointers for ExtractSlice

1 parent 1b2a86b

7 files changed: +171 -61 lines

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -263,6 +263,7 @@ def make_ttgir(mod, metadata, options):
     use_block_pingpong = is_pingpong_schedule_enabled(options.arch)
     if use_block_pingpong and options.num_stages in [2, 4]:
         amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages, use_async_copy)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
 
     if knobs.amd.use_buffer_ops:
         amd.passes.ttgpuir.add_canonicalize_pointers(pm)
```

third_party/amd/lib/TritonAMDGPUDialectToLLVM/ExtractSliceOpToLLVM.cpp

Lines changed: 47 additions & 49 deletions
```diff
@@ -1,3 +1,4 @@
+#include "../TritonAMDGPUToLLVM/Utility.h"
 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "TritonAMDGPUToLLVM/GCNAsmFormat.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
@@ -49,6 +50,7 @@ using namespace mlir::triton;
 // clang-format on
 
 namespace {
+
 struct ExtractSliceOpConversion
     : public ConvertOpToLLVMPattern<amdgpu::ExtractSliceOp> {
   explicit ExtractSliceOpConversion(LLVMTypeConverter &typeConverter,
@@ -60,61 +62,61 @@ struct ExtractSliceOpConversion
                              ConversionPatternRewriter &rewriter) const {
     Location loc = op->getLoc();
     auto srcTy = cast<RankedTensorType>(op.getSource().getType());
-    auto srcLayout = srcTy.getEncoding();
+    auto dstTy = cast<RankedTensorType>(op.getType());
     auto srcShape = srcTy.getShape();
-    auto resultTy = cast<RankedTensorType>(op.getType());
-    auto vals = unpackLLElements(loc, adaptor.getSource(), rewriter);
-    auto elemsPerThread = triton::gpu::getElemsPerThread(srcTy);
-    auto contigPerThread = triton::gpu::getContigPerThread(srcTy);
-    auto totalContigPerThread = product<unsigned>(contigPerThread);
-    auto order = triton::gpu::getOrder(srcTy);
+    auto dstShape = dstTy.getShape();
 
-    // Calculate valid total number of workers in each dimension
+    auto vals = unpackLLElements(loc, adaptor.getSource(), rewriter);
     auto shapePerCTATile = triton::gpu::getShapePerCTATile(srcTy);
-    shapePerCTATile[0] =
-        std::min(static_cast<unsigned>(srcShape[0]), shapePerCTATile[0]);
-    shapePerCTATile[1] =
-        std::min(static_cast<unsigned>(srcShape[1]), shapePerCTATile[1]);
-
-    // Rank == 2 checked in the verifier
-    SmallVector<int64_t, 2> sizes;
-    for (auto i = 0; i < 2; ++i) {
-      sizes.push_back(resultTy.getDimSize(i));
-    }
+    auto srcCTAShape = LLVM::AMD::multiDimElementwise<int64_t, unsigned>(
+        srcShape, shapePerCTATile, std::divides<unsigned>());
+    auto dstCTAShape = LLVM::AMD::multiDimElementwise<int64_t, unsigned>(
+        dstShape, shapePerCTATile, std::divides<unsigned>());
 
+    auto numCTATiles = std::accumulate(dstCTAShape.begin(), dstCTAShape.end(),
+                                       1, std::multiplies<>());
     auto offsets = op.getStaticOffsets();
+    auto firstTileCoordinate =
+        LLVM::AMD::multiDimElementwise<int64_t, unsigned>(
+            offsets, shapePerCTATile, std::divides<unsigned>());
 
-    // Calculate offsets and sizes in terms of CTA units.
-    std::array<int64_t, 2> CTAOffsets{offsets[0] / shapePerCTATile[0],
-                                      offsets[1] / shapePerCTATile[1]};
-    std::array<int64_t, 2> CTASizes{sizes[0] / shapePerCTATile[0],
-                                    sizes[1] / shapePerCTATile[1]};
-    std::array<int64_t, 2> CTAPerShape{srcShape[0] / shapePerCTATile[0],
-                                       srcShape[1] / shapePerCTATile[1]};
-
-    // The diagram above illustrates the graphical representation of the
-    // skipElems, tensorStride, and lastIdx variables.
-    auto skipElems = CTAOffsets[order[1]] * (elemsPerThread[order[0]] *
-                                             contigPerThread[order[1]]) +
-                     CTAOffsets[order[0]] * totalContigPerThread;
-    auto tensorStride =
-        (CTAPerShape[order[0]] - CTASizes[order[0]]) * totalContigPerThread;
-    auto lastIdx =
-        (CTAOffsets[order[1]] + CTASizes[order[1]] - 1) *
-            elemsPerThread[order[0]] * contigPerThread[order[1]] +
-        (CTAOffsets[order[0]] + CTASizes[order[0]]) * totalContigPerThread;
-
-    assert(lastIdx <= vals.size());
+    Attribute srcEncoding = srcTy.getEncoding();
+    Attribute dstEncoding = dstTy.getEncoding();
+    auto linearLayoutSrc = triton::gpu::toLinearLayout(srcShape, srcEncoding);
+    auto linearLayoutDst = triton::gpu::toLinearLayout(dstShape, dstEncoding);
 
+    auto srcCTAOrder =
+        LLVM::AMD::getCTATileOrder(srcTy.getContext(), linearLayoutSrc);
+    auto dstCTAOrder =
+        LLVM::AMD::getCTATileOrder(srcTy.getContext(), linearLayoutDst);
+
+    unsigned elemsPerThreadPerCTA =
+        triton::gpu::getTotalElemsPerThread(srcTy) /
+        std::accumulate(srcCTAShape.begin(), srcCTAShape.end(), 1,
+                        std::multiplies<>());
+
+    // 1. Process CTA tiles in the destination tensor according to the
+    //    destination's linear layout order of CTA tiles.
+    // 2. For each tile position in the destination tensor, compute its
+    //    corresponding position in the source tensor.
+    // 3. Copy the values from the source tile to the destination slice.
     SmallVector<Value> resultVals;
-    for (int i = skipElems; i < lastIdx; i += tensorStride) {
-      for (int j = 0; j < totalContigPerThread * CTASizes[order[0]]; ++j, ++i) {
-        assert(i < lastIdx);
-        resultVals.push_back(vals[i]);
+    for (size_t i = 0; i < numCTATiles; i++) {
+      auto coordInDstTensor =
+          mlir::LLVM::delinearize(i, dstCTAShape, dstCTAOrder);
+      auto coordInSrcTensor =
+          LLVM::AMD::multiDimElementwise<unsigned, unsigned>(
+              coordInDstTensor, firstTileCoordinate, std::plus<unsigned>());
+      auto linearIdxInSrcTensor =
+          mlir::LLVM::linearize(coordInSrcTensor, srcCTAShape, srcCTAOrder);
+
+      for (size_t j = 0; j < elemsPerThreadPerCTA; j++) {
+        resultVals.push_back(
+            vals[linearIdxInSrcTensor * elemsPerThreadPerCTA + j]);
       }
     }
     Value ret = packLLElements(loc, this->getTypeConverter(), resultVals,
-                               rewriter, resultTy);
+                               rewriter, dstTy);
 
     rewriter.replaceOp(op, ret);
     return success();
@@ -124,11 +126,7 @@ struct ExtractSliceOpConversion
   matchAndRewrite(amdgpu::ExtractSliceOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto srcTy = op.getSource().getType();
-    if (isa<BlockedEncodingAttr, AMDMfmaEncodingAttr>(
-            op.getSource().getType().getEncoding())) {
-      return processLayout(op, adaptor, rewriter);
-    }
-    return failure();
+    return processLayout(op, adaptor, rewriter);
   }
 };
 } // namespace
```
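
The new lowering drops the hand-derived skipElems/tensorStride arithmetic (and the blocked/MFMA special case): it walks the destination's CTA tiles in their linear-layout order, shifts each tile coordinate by the slice offset expressed in tile units, and copies the per-thread register values of the matching source tile. The sketch below reproduces only that index arithmetic as a standalone program; the delinearize/linearize helpers are simplified stand-ins for mlir::LLVM::delinearize/linearize, and the shapes, tile sizes, and offsets are made-up example values, not taken from the commit.

```cpp
// Standalone sketch of the per-CTA-tile remapping done in processLayout.
#include <iostream>
#include <vector>

// Decompose a linear index into multi-dim coordinates, walking dims in
// `order` from most-minor to most-major (simplified stand-in).
std::vector<unsigned> delinearize(unsigned linear,
                                  const std::vector<unsigned> &shape,
                                  const std::vector<unsigned> &order) {
  std::vector<unsigned> coord(shape.size(), 0);
  for (unsigned dim : order) {
    coord[dim] = linear % shape[dim];
    linear /= shape[dim];
  }
  return coord;
}

// Inverse of delinearize for the same `order`.
unsigned linearize(const std::vector<unsigned> &coord,
                   const std::vector<unsigned> &shape,
                   const std::vector<unsigned> &order) {
  unsigned linear = 0;
  for (auto it = order.rbegin(); it != order.rend(); ++it)
    linear = linear * shape[*it] + coord[*it];
  return linear;
}

int main() {
  // Hypothetical example: 128x128 source, 64x128 slice starting at row 64,
  // 32x32 CTA tiles, 16 register values per thread per tile.
  std::vector<unsigned> srcCTAShape = {4, 4}; // 128/32 x 128/32
  std::vector<unsigned> dstCTAShape = {2, 4}; //  64/32 x 128/32
  std::vector<unsigned> firstTile = {2, 0};   // offset {64, 0} / tile {32, 32}
  std::vector<unsigned> order = {1, 0};       // dim 1 is most minor
  unsigned elemsPerThreadPerCTA = 16;

  unsigned numCTATiles = dstCTAShape[0] * dstCTAShape[1];
  std::vector<unsigned> resultIdx; // source register indices, in dst order
  for (unsigned i = 0; i < numCTATiles; ++i) {
    auto dstCoord = delinearize(i, dstCTAShape, order);
    std::vector<unsigned> srcCoord = {dstCoord[0] + firstTile[0],
                                      dstCoord[1] + firstTile[1]};
    unsigned srcTile = linearize(srcCoord, srcCTAShape, order);
    for (unsigned j = 0; j < elemsPerThreadPerCTA; ++j)
      resultIdx.push_back(srcTile * elemsPerThreadPerCTA + j);
  }
  std::cout << "copied " << resultIdx.size() << " register values\n";
}
```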

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp

Lines changed: 39 additions & 0 deletions
```diff
@@ -755,4 +755,43 @@ void addLocalLoadNoAliasScope(AliasAnalysisOpInterface llLoadOp) {
   llLoadOp.setAliasScopes(aliasScopes);
 }
 
+SmallVector<unsigned> getCTATileOrder(MLIRContext *ctx,
+                                      const LinearLayout &layout) {
+  auto llEnc = triton::gpu::LinearEncodingAttr::get(ctx, layout);
+  auto regDim = StringAttr::get(ctx, "register");
+  auto &bases = layout.getBases().find(regDim)->second;
+
+  // Compute number of CTA tiles in a layout.
+  unsigned totalElems = layout.getTotalOutDimSize();
+  auto ctaShape = llEnc.getShapePerCTATile();
+  unsigned elemsPerCTA =
+      std::accumulate(ctaShape.begin(), ctaShape.end(), 1, std::multiplies<>());
+  assert((totalElems % elemsPerCTA) == 0 &&
+         "Total elements must be divisible by elemsPerCTA");
+  unsigned numCTAs = totalElems / elemsPerCTA;
+
+  // To determine the CTA tile order, start by identifying the register basis
+  // vector that corresponds to the first element of the second CTA tile. The
+  // nonzero index in the logical tensor it maps to indicates the most minor
+  // dimension. Then, for each subsequent basis register (first element of
+  // some CTA tile), extract the next nonzero index to build the full dimension
+  // order.
+  unsigned totalPerThread =
+      product(llEnc.basesPerDim(regDim, /*skipBroadcast=*/false)) / numCTAs;
+  unsigned startIndex = static_cast<unsigned>(std::log2(totalPerThread));
+
+  llvm::SmallSetVector<unsigned, 8> order;
+  for (unsigned i = startIndex; i < bases.size(); ++i) {
+    auto it = std::find_if(bases[i].begin(), bases[i].end(),
+                           [](unsigned v) { return v != 0; });
+    if (it != bases[i].end())
+      order.insert(std::distance(bases[i].begin(), it));
+  }
+
+  // Append any dims missing from our default order.
+  for (unsigned dim : llEnc.getOrder())
+    order.insert(dim);
+
+  return SmallVector<unsigned>(order.begin(), order.end());
+}
 } // namespace mlir::LLVM::AMD
```
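
getCTATileOrder reads the "register" basis vectors of the linear layout: the bases below log2(registers per tile) enumerate elements inside a single CTA tile, and each later basis is the first element of some other tile, so the dimension of its first nonzero coordinate names the next most-minor tile dimension. The standalone sketch below illustrates that derivation on hypothetical basis values, with plain std::vector standing in for the LinearLayout API; it is an analogy, not the real implementation above.

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  // Each basis maps one register bit to a coordinate in the logical tensor.
  // Made-up example: 8 registers per CTA tile, tiles repeated along dim 1
  // first (fastest), then dim 0.
  std::vector<std::vector<unsigned>> bases = {
      {0, 1}, {0, 2}, {1, 0}, // bits 0..2: elements within one CTA tile
      {0, 64},                // bit 3: first element of the 2nd tile (dim 1)
      {64, 0},                // bit 4: first tile along dim 0
  };
  unsigned elemsPerThreadPerTile = 8; // 2^3 registers cover one tile

  // Skip the in-tile bits; the remaining bases step from tile to tile.
  unsigned startIndex =
      static_cast<unsigned>(std::log2(elemsPerThreadPerTile));

  std::vector<unsigned> order;
  for (unsigned i = startIndex; i < bases.size(); ++i) {
    auto it = std::find_if(bases[i].begin(), bases[i].end(),
                           [](unsigned v) { return v != 0; });
    if (it == bases[i].end())
      continue;
    unsigned dim = std::distance(bases[i].begin(), it);
    if (std::find(order.begin(), order.end(), dim) == order.end())
      order.push_back(dim); // first nonzero coordinate names the dimension
  }
  // Append any dimension not touched by a tile-stepping basis (default order).
  for (unsigned dim : {1u, 0u})
    if (std::find(order.begin(), order.end(), dim) == order.end())
      order.push_back(dim);

  for (unsigned d : order)
    std::cout << d << " "; // prints "1 0": dim 1 is most minor for tiles
  std::cout << "\n";
}
```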

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h

Lines changed: 17 additions & 0 deletions
```diff
@@ -137,6 +137,23 @@ void addLocalLoadNoAliasScope(AliasAnalysisOpInterface llLoadOp);
 // Attaches the "AsyncCopies" alias scope to llLoadDirectToLdsOp
 void addAsyncCopyAliasScope(AliasAnalysisOpInterface llLoadDirectToLdsOp);
 
+// Determine the order in which CTA tiles are laid out across the tensor.
+SmallVector<unsigned> getCTATileOrder(MLIRContext *ctx,
+                                      const LinearLayout &layout);
+
+template <typename T, typename U, typename BinaryOp>
+std::vector<unsigned> multiDimElementwise(const ArrayRef<T> &lhs,
+                                          const ArrayRef<U> &rhs, BinaryOp op) {
+  assert(lhs.size() == rhs.size() && "Input dimensions must match");
+  std::vector<unsigned> result;
+  result.reserve(lhs.size());
+  for (size_t i = 0, n = lhs.size(); i < n; ++i) {
+    unsigned a = static_cast<unsigned>(lhs[i]);
+    unsigned b = static_cast<unsigned>(rhs[i]);
+    result.push_back(op(a, b));
+  }
+  return result;
+}
 } // namespace mlir::LLVM::AMD
 
 #endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_UTILITY_H_
```
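
multiDimElementwise simply zips two equal-length shape/coordinate vectors through a binary op and returns the result as unsigned values; the diffs above use it to divide tensor shapes by shapePerCTATile and to add tile coordinates. Below is a minimal sketch of the same semantics, using std::vector in place of llvm::ArrayRef so it compiles standalone, with made-up shapes.

```cpp
#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Same contract as the header's multiDimElementwise, minus the ArrayRef types.
template <typename T, typename U, typename BinaryOp>
std::vector<unsigned> multiDimElementwise(const std::vector<T> &lhs,
                                          const std::vector<U> &rhs,
                                          BinaryOp op) {
  assert(lhs.size() == rhs.size() && "Input dimensions must match");
  std::vector<unsigned> result;
  result.reserve(lhs.size());
  for (size_t i = 0; i < lhs.size(); ++i)
    result.push_back(
        op(static_cast<unsigned>(lhs[i]), static_cast<unsigned>(rhs[i])));
  return result;
}

int main() {
  std::vector<int64_t> srcShape = {128, 128};
  std::vector<unsigned> shapePerCTATile = {32, 32};

  // Tensor shape in units of CTA tiles: {4, 4}.
  auto ctaShape =
      multiDimElementwise(srcShape, shapePerCTATile, std::divides<unsigned>());
  // Shift a tile coordinate by the slice's first-tile coordinate: {3, 1}.
  auto shifted = multiDimElementwise(std::vector<unsigned>{1u, 1u},
                                     std::vector<unsigned>{2u, 0u},
                                     std::plus<unsigned>());
  std::cout << ctaShape[0] << "x" << ctaShape[1] << ", " << shifted[0] << ","
            << shifted[1] << "\n"; // 4x4, 3,1
}
```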

third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp

Lines changed: 18 additions & 4 deletions
```diff
@@ -1518,6 +1518,12 @@ LogicalResult Pingponger::transformFP4(OpBuilder &builder, Location loc) {
   builder.setInsertionPointAfter(dotSOps[0]);
   if (sliceDotScaled(builder, loc, dotSOps[0], 4).failed())
     return failure();
+
+  if (genAsyncCopySlices(builder).failed()) {
+    LDBG("failed to slice global-to-local async copies");
+    return failure();
+  }
+
   updateOpInsertion(dotSliceOps[0]);
 
   appendOp(builder.create<ROCDL::SchedBarrier>(loc, 0));
@@ -1681,10 +1687,6 @@ void Pingponger::getDotPingponged() {
       return;
     }
 
-    if (llvm::failed(genAsyncCopySlices(builder))) {
-      LDBG("failed to slice global-to-local async copies");
-    }
-
     auto updateSignature = updateForOpSignature(builder);
     if (llvm::failed(updateSignature)) {
       LDBG("failed to update forOp signature");
@@ -1695,6 +1697,18 @@ void Pingponger::getDotPingponged() {
        LDBG("failed to update forOp signature");
      }
    }
+
+    forOp->walk([](ttg::AsyncCommitGroupOp groupOp) {
+      auto users = groupOp.getResult().getUsers();
+      if (users.empty()) {
+        SmallVector<Operation *> toDeleteVec;
+        for (auto token : groupOp.getInputTokens()) {
+          toDeleteVec.push_back(token.getDefiningOp());
+        }
+        groupOp->erase();
+        llvm::for_each(toDeleteVec, [](Operation *op) { op->erase(); });
+      }
+    });
     addAsymmetricSyncToLoop(builder, loc);
     return;
   }
```

third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp

Lines changed: 45 additions & 8 deletions
```diff
@@ -1176,20 +1176,57 @@ class ConvertExtractSliceOp
     }
 
     Location loc = extractSliceOp->getLoc();
-
+    RankedTensorType resultType = extractSliceOp.getResult().getType();
     const FatPointers::FatPtrAttrs &fatPtrAttrs =
         fatPtrs.at({fatPtrBase, fatPtrOffset});
-    auto newSrc = createTensorPointer(rewriter, fatPtrBase, fatPtrOffset, loc,
-                                      fatPtrAttrs);
 
-    RankedTensorType resType = extractSliceOp.getResult().getType();
-    tt::amdgpu::ExtractSliceOp newExtractSliceOp =
+    Value newFatPtrOffset = nullptr;
+    auto origFatOffsetType = dyn_cast<RankedTensorType>(fatPtrOffset.getType());
+    auto slicedFatOffsetType = RankedTensorType::get(
+        resultType.getShape(), origFatOffsetType.getElementType(),
+        origFatOffsetType.getEncoding());
+
+    tt::amdgpu::ExtractSliceOp slicedFatPtrOffset =
         rewriter.create<tt::amdgpu::ExtractSliceOp>(
-            loc, Type{resType}, Value{newSrc},
+            loc, Type{slicedFatOffsetType}, Value{fatPtrOffset},
             extractSliceOp.getStaticOffsetsAttr());
-    rewriter.replaceOp(extractSliceOp, newExtractSliceOp);
-    fatPtrs[{fatPtrBase, newExtractSliceOp}] =
+
+    auto newResultPtrType =
+        RankedTensorType::get(resultType.getShape(), fatPtrBase.getType(),
+                              origFatOffsetType.getEncoding());
+
+    // Scalar case: we only need to `tt.addptr %basePtr, %offset`
+    if (!origFatOffsetType) {
+      auto addPtrOp = rewriter.create<tt::AddPtrOp>(
+          loc, newResultPtrType, fatPtrBase, slicedFatPtrOffset);
+      for (const auto &attribute : fatPtrAttrs.attributes)
+        addPtrOp->setAttr(attribute.getFirst(), attribute.getSecond());
+      newFatPtrOffset = addPtrOp.getResult();
+    }
+
+    // Tensor case: splat the scalar pointer and add the (tensor) offset:
+    // ```
+    //   %tensorBasePtr = tt.splat %basePtr
+    //   %tensorPtr = tt.addptr %tensorBasePtr, %offset
+    // ```
+    if (fatPtrAttrs.canNarrow)
+      fatPtrOffset = createTruncIOffset(rewriter, loc, fatPtrOffset,
+                                        rewriter.getI32Type());
+
+    tt::SplatOp tensorPtr =
+        rewriter.create<tt::SplatOp>(loc, newResultPtrType, fatPtrBase);
+    tt::AddPtrOp addPtrOp = rewriter.create<tt::AddPtrOp>(
+        loc, newResultPtrType, tensorPtr, slicedFatPtrOffset);
+
+    for (const auto &attribute : fatPtrAttrs.attributes)
+      addPtrOp->setAttr(attribute.getFirst(), attribute.getSecond());
+    newFatPtrOffset = addPtrOp.getResult();
+
+    assert(newFatPtrOffset);
+    rewriter.replaceOp(extractSliceOp, newFatPtrOffset);
+    fatPtrs[{fatPtrBase, newFatPtrOffset}] =
         fatPtrs.at({fatPtrBase, fatPtrOffset});
+
     return success();
   }
 };
```

third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -200,6 +200,10 @@ bool verifyNonNegativeExpr(
         return verifyNonSmallerByAssumption(op.getLhs(), assumptions,
                                             op.getRhs());
       })
+      .Case<triton::amdgpu::ExtractSliceOp>([&](auto op) {
+        return verifyNonNegativeExpr(op->getOperand(0), assumptions,
+                                     solver);
+      })
       .Default([&](Operation *) {
         // Conservatively assume that the expression is negative
         LDBG(" Unhandled op, cannot assume non-negative");
```
