Skip to content

Commit 58b35fe

Browse files
Revert "[LAYOUTS] Implement generalized swizzling for convert_layout (#7565)"
This reverts commit b3b9931.
1 parent 661f264 commit 58b35fe

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+650
-1358
lines changed

bin/RegisterTritonDialects.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
8989
mlir::triton::registerConvertWarpSpecializeToLLVM();
9090
mlir::triton::registerConvertTritonGPUToLLVMPass();
9191
mlir::triton::registerConvertNVGPUToLLVMPass();
92-
mlir::triton::registerAllocateSharedMemoryNvPass();
9392
mlir::registerLLVMDIScope();
9493
mlir::triton::gpu::intel::registerTritonAnnotateModulePass();
9594
mlir::triton::gpu::intel::registerTritonIntelGPUPasses();

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ class TargetInfoBase {
3838
pred);
3939
}
4040

41+
virtual bool canUseStMatrix(RankedTensorType tensorTy,
42+
ArrayRef<unsigned> repShape,
43+
ArrayRef<unsigned> paddedRepShape,
44+
ArrayRef<unsigned> order,
45+
int swizzleByteSize) const = 0;
46+
47+
virtual void storeMatrixShared(RewriterBase &rewriter, Location loc,
48+
Value ptr, Value val) const = 0;
49+
4150
virtual Value shuffleXor(RewriterBase &rewriter, Location loc, Value val,
4251
int i) const = 0;
4352
virtual Value shuffleUp(RewriterBase &rewriter, Location loc, Value val,

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
1111
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
1212
#include "triton/Dialect/TritonGPU/IR/Types.h"
13-
#include "triton/Tools/GenericSwizzling.h"
1413
#include "triton/Tools/LinearLayout.h"
1514
#include "triton/Tools/StrUtil.h"
1615
#include "llvm/ADT/STLExtras.h"
@@ -322,10 +321,6 @@ namespace mlir {
322321
namespace triton {
323322

324323
namespace gpu {
325-
326-
std::pair<SmallVector<LocalMemOpTile>, SmallVector<LocalMemOpTile>>
327-
getSrcDstTiles(const TargetInfoBase &targetInfo, int bitwidth);
328-
329324
Type getFunctionType(Type resultType, ValueRange operands);
330325

331326
LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
@@ -612,6 +607,10 @@ std::optional<LLVM::AtomicBinOp> matchAtomicOp(RMWOp atomicOp);
612607

613608
std::optional<LLVM::AtomicOrdering> getMemoryOrdering(MemSemantic memOrdering);
614609

610+
bool isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
611+
ArrayRef<int64_t> allocShape,
612+
triton::gpu::SharedEncodingTrait sharedEnc);
613+
615614
llvm::MapVector<StringAttr, int32_t> getAllFreeVarMasks(MLIRContext *ctx);
616615

617616
llvm::MapVector<StringAttr, int32_t> getFreeVariableMasks(Type type);

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@
77

88
namespace mlir {
99

10-
// Bitwidth of pointers
11-
constexpr int kPtrBitWidth = 64;
12-
1310
template <typename T, typename U> SmallVector<T> convertType(ArrayRef<U> in) {
1411
SmallVector<T> out;
1512
for (const auto &i : in)
@@ -189,7 +186,6 @@ bool isHostSideDescriptor(Value v);
189186

190187
bool isKernel(FunctionOpInterface funcOp);
191188

192-
unsigned getBitwidth(RankedTensorType ty);
193189
} // namespace triton
194190
} // namespace mlir
195191

include/triton/Tools/GenericSwizzling.h

Lines changed: 5 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4,49 +4,17 @@
44
#include "llvm/ADT/ArrayRef.h"
55
#include "llvm/ADT/SmallVector.h"
66
#include <cstdint>
7-
#include <utility>
87

98
namespace mlir::triton {
109
class LinearLayout;
11-
class TargetInfoBase;
12-
} // namespace mlir::triton
10+
}
1311

1412
namespace mlir::triton::gpu {
15-
// Store the lane indices that are used in the contiguous part
16-
// of an operation and in the address part.
17-
// The laneAddr part just represents the indices used in one wavefront
18-
// For now we just represent tiles with full vectorisation, meaning
19-
// ld.shared.b32.v4/st.shared.b32.v4
20-
// ldmatrix.v4 / stmatrix.v4
21-
// ldmatrix.trans.v4 / stmatrix.trans.v4
22-
struct LocalMemOpTile {
23-
// If laneContig.size() < log2(128/bitwidth), we assume that
24-
// the first log2(128/bitwidth) - laneContig.size() bases are registers
25-
llvm::SmallVector<int32_t> laneContig;
26-
// If laneAddr.size() < 3, we assume that the first
27-
// 3 - laneAddr.size() bases are registers
28-
llvm::SmallVector<int32_t> laneAddr;
29-
};
13+
LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
14+
int32_t bitwidth);
3015

31-
// Given a set of possible instructions given by
32-
// targetInfo.laneIdTiles(bitwidth) returns the optimal swizzling given these
33-
// instructions and a pair of indices into the ldStTiles that's needed to lower
34-
// this swizzling
35-
std::pair<LinearLayout, std::pair<int32_t, int32_t>>
36-
optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
37-
llvm::ArrayRef<LocalMemOpTile> srcTiles,
38-
llvm::ArrayRef<LocalMemOpTile> dstTiles, int32_t bitwidth);
39-
40-
LinearLayout optimalSwizzlingLdSt(const LinearLayout &src,
41-
const LinearLayout &dst, int32_t bitwidth);
42-
43-
std::pair<int, int> logBankConflictsLdSt(const LinearLayout &src,
44-
const LinearLayout &dst,
45-
const LinearLayout &smem,
46-
int32_t bitwidth);
47-
48-
std::pair<int, int> logBankConflicts(llvm::ArrayRef<int32_t> tileSrc,
49-
llvm::ArrayRef<int32_t> tileDst,
16+
std::pair<int, int> logBankConflicts(const LinearLayout &src,
17+
const LinearLayout &dst,
5018
const LinearLayout &smem,
5119
int32_t bitwidth);
5220
} // namespace mlir::triton::gpu

include/triton/Tools/LayoutUtils.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,7 @@ std::optional<ColumnAction> regPermForDivide(const LinearLayout &A,
126126
ColumnAction actionRemoveBroadcastedRegs(const LinearLayout &layout);
127127

128128
std::pair<int64_t, ColumnAction>
129-
actionAdditiveStrides(const LinearLayout &layout, const LinearLayout addrLayout,
130-
uint64_t maskSpanOffsets);
129+
actionAdditiveStrides(const LinearLayout &layout, uint64_t maskSpanOffsets);
131130

132131
// For a layout A with A.hasInDim(kReg), repeat the values so that they have
133132
// the same broadcasting as layout

lib/Analysis/Allocation.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,14 @@ namespace mlir {
2929
//===----------------------------------------------------------------------===//
3030
namespace triton {
3131

32+
// Bitwidth of pointers
33+
constexpr int kPtrBitWidth = 64;
3234
// Max shmem LDS/STS instruction in bits
3335
constexpr int kMaxShmemVecBitLength = 128;
3436

35-
unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
36-
RankedTensorType dstTy) {
37-
auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
38-
return getNumScratchElements(scratchConfig.paddedRepShape);
37+
static unsigned getBitwidth(RankedTensorType ty) {
38+
auto isPtr = isa<PointerType>(ty.getElementType());
39+
return isPtr ? kPtrBitWidth : std::max(ty.getElementTypeBitWidth(), 8u);
3940
}
4041

4142
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
@@ -46,11 +47,17 @@ unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
4647
srcLayout = actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
4748
dstLayout = actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
4849
auto bitwidth = getBitwidth(srcTy);
49-
auto smem = gpu::optimalSwizzlingLdSt(srcLayout, dstLayout, bitwidth);
50+
auto smem = gpu::optimalSwizzling(srcLayout, dstLayout, bitwidth);
5051
auto reps = smem.getInDimSize(StringAttr::get(ctx, "reps"));
5152
return smem.getTotalOutDimSize() / reps;
5253
}
5354

55+
unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
56+
RankedTensorType dstTy) {
57+
auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
58+
return getNumScratchElements(scratchConfig.paddedRepShape);
59+
}
60+
5461
static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
5562
RankedTensorType dstTy) {
5663
Attribute srcLayout = srcTy.getEncoding();
@@ -208,8 +215,10 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
208215
auto dstTy = cvtLayout.getType();
209216
if (!cvtNeedsSharedMemory(srcTy, dstTy))
210217
return 0;
211-
// The generic pass uses swizzling
212-
auto elems = getNumScratchElemsSwizzledCvt(srcTy, dstTy);
218+
// Pessimistically take the max. We will revisit later
219+
auto elems = std::max(getNumScratchElemsSwizzledCvt(srcTy, dstTy),
220+
getNumScratchElemsPaddedCvt(srcTy, dstTy));
221+
213222
return elems * getBitwidth(srcTy) / 8;
214223
}
215224
if (isa<AtomicRMWOp, AtomicCASOp>(op)) {

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
6363
} else if (llvm::is_contained(dims, kWarp)) {
6464
// Case 2: Transfer between values in the same CTA, in which case we move
6565
// values through shared memory.
66-
transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
67-
return success();
66+
return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter);
6867
} else if (llvm::is_contained(dims, kLane)) {
6968
// Case 3. Transfer between values in the same warp, in which case we try
7069
// to move values using warp shuffles, though if the pattern is
@@ -75,8 +74,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
7574
// TODO: Since data is only transferred within a warp over shared memory,
7675
// we should use `bar.warp.sync` instead of `barrier`, which will improve
7776
// latency when warps issue barriers on different cycles.
78-
transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
79-
return success();
77+
return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter);
8078
} else if (llvm::is_contained(dims, kRegister)) {
8179
// Case 4. Transfer between values in the same thread, in which case we
8280
// simply reorder the elements of adaptor.getSrc().
@@ -171,7 +169,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
171169
// At this point we have a type that's at least 8-bit
172170
// and we don't have broadcasting in the registers
173171
auto bitwidth = llvmElemTy.getIntOrFloatBitWidth();
174-
auto smem = optimalSwizzlingLdSt(srcLayout, dstLayout, bitwidth);
172+
auto smem = optimalSwizzling(srcLayout, dstLayout, bitwidth);
175173

176174
// Extract reps from smem
177175
auto kReg = str_attr("register");
@@ -203,9 +201,9 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
203201

204202
assert(permutedInVals.size() == tileSize * nReps);
205203
SmallVector<Value> outVals;
204+
auto noPaddingOffset = [](Value v) { return v; };
206205
auto affineOffset = b.i32_val(0);
207206
auto maskSpanAffineOffset = 0;
208-
auto noPaddingOffset = [](Value v) { return v; };
209207
for (int i = 0; i < nReps; ++i) {
210208
if (i > 0)
211209
b.barrier();
@@ -229,8 +227,20 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
229227
return outVals;
230228
}
231229

232-
void transferWithinBlockSwizzling(ConvertLayoutOp op, Value src,
233-
ConversionPatternRewriter &rewriter) const {
230+
LogicalResult
231+
transferWithinBlockSwizzling(ConvertLayoutOp op, Value src,
232+
ConversionPatternRewriter &rewriter) const {
233+
// Fallback for now to standard lowering if it can use stmatrix
234+
auto scratchConfig =
235+
getScratchConfigForCvt(op.getSrc().getType(), op.getType());
236+
bool isStMatrix = targetInfo.canUseStMatrix(
237+
op.getSrc().getType(), scratchConfig.repShape,
238+
scratchConfig.paddedRepShape, scratchConfig.order,
239+
/*swizzleByteSize=*/0);
240+
if (isStMatrix) {
241+
return failure();
242+
}
243+
234244
auto loc = op.getLoc();
235245
auto *ctx = op.getContext();
236246
auto srcTy = op.getSrc().getType();
@@ -258,6 +268,28 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
258268
Value result =
259269
packLLElements(loc, getTypeConverter(), outVals, rewriter, dstTy);
260270
rewriter.replaceOp(op, result);
271+
return success();
272+
}
273+
274+
LogicalResult transferWithinBlock(ConvertLayoutOp op,
275+
const LinearLayout &srcLayout,
276+
const LinearLayout &dstLayout,
277+
OpAdaptor adaptor,
278+
ConversionPatternRewriter &rewriter) const {
279+
assert(cvtNeedsSharedMemory(op.getSrc().getType(), op.getType()));
280+
281+
// Try to use swizzling to implement the conversion
282+
// HACK Remove once AMD tests pass for the swizzling path
283+
if (targetInfo.isCuda() && succeeded(transferWithinBlockSwizzling(
284+
op, adaptor.getSrc(), rewriter))) {
285+
return success();
286+
}
287+
288+
Value result = transferWithinBlockPadding(op, adaptor.getSrc(), targetInfo,
289+
getTypeConverter(), rewriter);
290+
291+
rewriter.replaceOp(op, result);
292+
return success();
261293
}
262294

263295
// Use warp shuffles to implement a layout conversion where data only needs to

0 commit comments

Comments (0)