Commit b3b9931
[LAYOUTS] Implement generalized swizzling for convert_layout (#7565)
We generalize the swizzling algorithm to take into account the `ldmatrix`/`stmatrix` instructions and their transposed versions. To do this, we now need a dedicated shared-memory allocator for NVIDIA, as the shared memory required by a `convert_layout` now depends on the instructions we can emit. After cleaning the `stmatrix` path out of the common `convert_layout` lowering, it became clear that we always take the swizzling path, so I changed the allocator to reflect this and updated the many tests that relied on the old allocator and no longer require padding. We also implement an improved lowering for the indexing of `ldmatrix`/`stmatrix`, following the optimisations used for `ld.shared`/`st.shared`.
1 parent ea4bdaf commit b3b9931


41 files changed: +1382 −711 lines. (Large commit: only a subset of the file diffs is reproduced below.)
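For orientation, here is a minimal standalone sketch (illustrative only, not code from this commit) of the XOR-swizzling idea that the generalized algorithm builds on: permute the shared-memory column with bits of the row so that both row-major and column-major warp accesses spread across all 32 banks.

#include <cstdint>

// Illustrative tile: 32x32 elements of 4 bytes, one bank-width per element.
// XOR-ing the column with the row makes every logical column visit all 32
// banks across its rows, so transposed accesses are also conflict-free.
uint32_t swizzledOffset(uint32_t row, uint32_t col) {
  uint32_t swizzledCol = (col ^ row) & 31; // 32 banks of 4 bytes each
  return row * 32 + swizzledCol;           // offset in 4-byte words
}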

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::triton::registerConvertWarpSpecializeToLLVM();
   mlir::triton::registerConvertTritonGPUToLLVMPass();
   mlir::triton::registerConvertNVGPUToLLVMPass();
+  mlir::triton::registerAllocateSharedMemoryNvPass();
   mlir::registerLLVMDIScope();

   // TritonAMDGPUToLLVM passes

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 0 additions & 9 deletions
@@ -38,15 +38,6 @@ class TargetInfoBase {
                        pred);
   }

-  virtual bool canUseStMatrix(RankedTensorType tensorTy,
-                              ArrayRef<unsigned> repShape,
-                              ArrayRef<unsigned> paddedRepShape,
-                              ArrayRef<unsigned> order,
-                              int swizzleByteSize) const = 0;
-
-  virtual void storeMatrixShared(RewriterBase &rewriter, Location loc,
-                                 Value ptr, Value val) const = 0;
-
   virtual Value shuffleXor(RewriterBase &rewriter, Location loc, Value val,
                            int i) const = 0;
   virtual Value shuffleUp(RewriterBase &rewriter, Location loc, Value val,

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 9 additions & 9 deletions
@@ -10,6 +10,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/IR/Types.h"
+#include "triton/Tools/GenericSwizzling.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/STLExtras.h"
@@ -321,6 +322,10 @@ namespace mlir {
 namespace triton {

 namespace gpu {
+
+std::pair<SmallVector<LocalMemOpTile>, SmallVector<LocalMemOpTile>>
+getSrcDstTiles(const TargetInfoBase &targetInfo, int bitwidth);
+
 Type getFunctionType(Type resultType, ValueRange operands);

 LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
@@ -608,10 +613,6 @@ std::optional<LLVM::AtomicBinOp> matchAtomicOp(RMWOp atomicOp);

 std::optional<LLVM::AtomicOrdering> getMemoryOrdering(MemSemantic memOrdering);

-bool isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
-                                ArrayRef<int64_t> allocShape,
-                                triton::gpu::SharedEncodingTrait sharedEnc);
-
 llvm::MapVector<StringAttr, int32_t> getAllFreeVarMasks(MLIRContext *ctx);

 llvm::MapVector<StringAttr, int32_t> getFreeVariableMasks(Type type);
@@ -644,11 +645,10 @@ Value transferWithinBlockPadding(triton::gpu::ConvertLayoutOp op, Value src,
                                  const LLVMTypeConverter *typeConverter,
                                  RewriterBase &rewriter);

-LogicalResult
-transferWithinBlockSwizzling(triton::gpu::ConvertLayoutOp op, Value src,
-                             const TargetInfoBase &targetInfo,
-                             const LLVMTypeConverter *typeConverter,
-                             RewriterBase &rewriter);
+void transferWithinBlockSwizzling(triton::gpu::ConvertLayoutOp op, Value src,
+                                  const TargetInfoBase &targetInfo,
+                                  const LLVMTypeConverter *typeConverter,
+                                  RewriterBase &rewriter);

 SmallVector<Value> inlineRegionImpl(RewriterBase &rewriter, Region &region,
                                     ArrayRef<Value> args,

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 4 additions & 0 deletions
@@ -7,6 +7,9 @@

 namespace mlir {

+// Bitwidth of pointers
+constexpr int kPtrBitWidth = 64;
+
 template <typename T, typename U> SmallVector<T> convertType(ArrayRef<U> in) {
   SmallVector<T> out;
   for (const auto &i : in)
@@ -186,6 +189,7 @@ bool isHostSideDescriptor(Value v);

 bool isKernel(FunctionOpInterface funcOp);

+unsigned getBitwidth(RankedTensorType ty);
 } // namespace triton
 } // namespace mlir

include/triton/Tools/GenericSwizzling.h

Lines changed: 37 additions & 5 deletions
@@ -4,17 +4,49 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include <cstdint>
+#include <utility>

 namespace mlir::triton {
 class LinearLayout;
-}
+class TargetInfoBase;
+} // namespace mlir::triton

 namespace mlir::triton::gpu {
-LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
-                              int32_t bitwidth);
+// Stores the lane indices used in the contiguous part of an operation and in
+// its address part. The laneAddr part represents just the indices used within
+// one wavefront. For now we only represent tiles with full vectorisation,
+// meaning
+// ld.shared.b32.v4 / st.shared.b32.v4
+// ldmatrix.v4 / stmatrix.v4
+// ldmatrix.trans.v4 / stmatrix.trans.v4
+struct LocalMemOpTile {
+  // If laneContig.size() < log2(128/bitwidth), we assume that the first
+  // log2(128/bitwidth) - laneContig.size() bases are registers
+  llvm::SmallVector<int32_t> laneContig;
+  // If laneAddr.size() < 3, we assume that the first
+  // 3 - laneAddr.size() bases are registers
+  llvm::SmallVector<int32_t> laneAddr;
+};

-std::pair<int, int> logBankConflicts(const LinearLayout &src,
-                                     const LinearLayout &dst,
+// Given the set of possible instructions reported by
+// targetInfo.laneIdTiles(bitwidth), returns the optimal swizzling for those
+// instructions, together with the pair of indices into the ldStTiles needed
+// to lower this swizzling.
+std::pair<LinearLayout, std::pair<int32_t, int32_t>>
+optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
+                 llvm::ArrayRef<LocalMemOpTile> srcTiles,
+                 llvm::ArrayRef<LocalMemOpTile> dstTiles, int32_t bitwidth);
+
+LinearLayout optimalSwizzlingLdSt(const LinearLayout &src,
+                                  const LinearLayout &dst, int32_t bitwidth);
+
+std::pair<int, int> logBankConflictsLdSt(const LinearLayout &src,
+                                         const LinearLayout &dst,
+                                         const LinearLayout &smem,
+                                         int32_t bitwidth);
+
+std::pair<int, int> logBankConflicts(llvm::ArrayRef<int32_t> tileSrc,
+                                     llvm::ArrayRef<int32_t> tileDst,
                                      const LinearLayout &smem,
                                      int32_t bitwidth);
 } // namespace mlir::triton::gpu

include/triton/Tools/LayoutUtils.h

Lines changed: 2 additions & 1 deletion
@@ -126,7 +126,8 @@ std::optional<ColumnAction> regPermForDivide(const LinearLayout &A,
 ColumnAction actionRemoveBroadcastedRegs(const LinearLayout &layout);

 std::pair<int64_t, ColumnAction>
-actionAdditiveStrides(const LinearLayout &layout, uint64_t maskSpanOffsets);
+actionAdditiveStrides(const LinearLayout &layout, const LinearLayout addrLayout,
+                      uint64_t maskSpanOffsets);

 // For a layout A with A.hasInDim(kReg), repeat the values so that they have
 // the same broadcasting as layout

lib/Analysis/Allocation.cpp

Lines changed: 7 additions & 16 deletions
@@ -29,14 +29,13 @@ namespace mlir {
 //===----------------------------------------------------------------------===//
 namespace triton {

-// Bitwidth of pointers
-constexpr int kPtrBitWidth = 64;
 // Max shmem LDS/STS instruction in bits
 constexpr int kMaxShmemVecBitLength = 128;

-static unsigned getBitwidth(RankedTensorType ty) {
-  auto isPtr = isa<PointerType>(ty.getElementType());
-  return isPtr ? kPtrBitWidth : std::max(ty.getElementTypeBitWidth(), 8u);
+unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
+                                     RankedTensorType dstTy) {
+  auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
+  return getNumScratchElements(scratchConfig.paddedRepShape);
 }

 unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
@@ -47,17 +46,11 @@ unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
   srcLayout = actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
   dstLayout = actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
   auto bitwidth = getBitwidth(srcTy);
-  auto smem = gpu::optimalSwizzling(srcLayout, dstLayout, bitwidth);
+  auto smem = gpu::optimalSwizzlingLdSt(srcLayout, dstLayout, bitwidth);
   auto reps = smem.getInDimSize(StringAttr::get(ctx, "reps"));
   return smem.getTotalOutDimSize() / reps;
 }

-unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
-                                     RankedTensorType dstTy) {
-  auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
-  return getNumScratchElements(scratchConfig.paddedRepShape);
-}
-
 static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
@@ -215,10 +208,8 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     auto dstTy = cvtLayout.getType();
     if (!cvtNeedsSharedMemory(srcTy, dstTy))
       return 0;
-    // Pesimistically take the max. We will revisit later
-    auto elems = std::max(getNumScratchElemsSwizzledCvt(srcTy, dstTy),
-                          getNumScratchElemsPaddedCvt(srcTy, dstTy));
-
+    // The generic pass uses swizzling
+    auto elems = getNumScratchElemsSwizzledCvt(srcTy, dstTy);
     return elems * getBitwidth(srcTy) / 8;
   }
   if (isa<AtomicRMWOp, AtomicCASOp>(op)) {
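As a back-of-envelope example of the new scratch-size rule (hypothetical numbers, not taken from the commit), mirroring the arithmetic in getNumScratchElemsSwizzledCvt and defaultAllocationAnalysisScratchSizeFn:

#include <cassert>

int main() {
  // Hypothetical: optimalSwizzlingLdSt returns a shared-memory layout whose
  // total out-dim size is 4096 elements, split over 2 reps (round trips
  // through the same scratch tile), for a 16-bit conversion.
  unsigned totalOutDimSize = 4096;         // smem.getTotalOutDimSize()
  unsigned reps = 2;                       // smem.getInDimSize("reps")
  unsigned elems = totalOutDimSize / reps; // 2048 scratch elements
  unsigned bytes = elems * 16 / 8;         // elems * bitwidth / 8
  assert(bytes == 4096);                   // shared memory reserved, in bytes
  return 0;
}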

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 145 additions & 15 deletions
@@ -63,7 +63,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     } else if (llvm::is_contained(dims, kWarp)) {
       // Case 2: Transfer between values in the same CTA, in which case we move
       // values through shared memory.
-      return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter);
+      transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
+      return success();
     } else if (llvm::is_contained(dims, kLane)) {
       // Case 3. Transfer between values in the same warp, in which case we try
       // to move values using warp shuffles, though if the pattern is
@@ -74,7 +75,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       // TODO: Since data is only transferred within a warp over shared memory,
       // we should use `bar.warp.sync` instead of `barrier`, which will improve
       // latency when warps issue barriers on different cycles.
-      return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter);
+      transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
+      return success();
     } else if (llvm::is_contained(dims, kRegister)) {
       // Case 4. Transfer between values in the same thread, in which case we
       // simply reorder the elements of adaptor.getSrc().
@@ -110,24 +112,152 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     return success();
   }

-  LogicalResult transferWithinBlock(ConvertLayoutOp op,
-                                    const LinearLayout &srcLayout,
-                                    const LinearLayout &dstLayout,
-                                    OpAdaptor adaptor,
-                                    ConversionPatternRewriter &rewriter) const {
-    assert(cvtNeedsSharedMemory(op.getSrc().getType(), op.getType()));
+  SmallVector<Value> transferWithinBlockSwizzlingImpl(
+      Location loc, ConversionPatternRewriter &rewriter,
+      const LinearLayout &srcLayout, const LinearLayout &dstLayout,
+      ArrayRef<Value> inVals, Type llvmElemTy, Value smemBase) const {
+    auto *ctx = rewriter.getContext();
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+    // We handle transformations recursively as they all need a preprocessing
+    // and a postprocessing step.
+
+    // Handle pointer types as 64-bit integers
+    if (isa<LLVM::LLVMPointerType>(llvmElemTy)) {
+      auto llvmElemTyPtr = i64_ty;
+      auto newInVals = llvm::to_vector(llvm::map_range(inVals, [&](Value v) {
+        return b.ptrtoint(llvmElemTyPtr, v).getResult();
+      }));
+      auto outVals =
+          transferWithinBlockSwizzlingImpl(loc, rewriter, srcLayout, dstLayout,
+                                           newInVals, llvmElemTyPtr, smemBase);
+      for (auto &v : outVals) {
+        v = b.inttoptr(llvmElemTy, v);
+      }
+      return outVals;
+    }

-    // Try to use swizzling to implement the conversion
-    if (succeeded(transferWithinBlockSwizzling(op, adaptor.getSrc(), targetInfo,
-                                               getTypeConverter(), rewriter))) {
-      return success();
+    // Handle sub-byte elements like i1
+    if (llvmElemTy.getIntOrFloatBitWidth() < 8) {
+      // Upcast to i8
+      auto i8ElemTy = i8_ty;
+      auto newInVals = llvm::to_vector(llvm::map_range(
+          inVals, [&](Value v) { return b.zext(i8ElemTy, v).getResult(); }));
+      auto outVals = transferWithinBlockSwizzlingImpl(
+          loc, rewriter, srcLayout, dstLayout, newInVals, i8ElemTy, smemBase);
+      for (auto &v : outVals) {
+        v = b.trunc(llvmElemTy, v);
+      }
+      return outVals;
     }

-    Value result = transferWithinBlockPadding(op, adaptor.getSrc(), targetInfo,
-                                              getTypeConverter(), rewriter);
+    // Remove broadcasting in src
+    auto removeBroadcastSrc = actionRemoveBroadcastedRegs(srcLayout);
+    if (!removeBroadcastSrc.isIdentity()) {
+      auto prmtSrc = removeBroadcastSrc.apply(srcLayout);
+      auto newInVals = removeBroadcastSrc.apply(inVals);
+      return transferWithinBlockSwizzlingImpl(loc, rewriter, prmtSrc, dstLayout,
+                                              newInVals, llvmElemTy, smemBase);
+    }

+    // Remove broadcasting in dst
+    auto removeBroadcastDst = actionRemoveBroadcastedRegs(dstLayout);
+    if (!removeBroadcastDst.isIdentity()) {
+      auto prmtDst = removeBroadcastDst.apply(dstLayout);
+      auto outVals = transferWithinBlockSwizzlingImpl(
+          loc, rewriter, srcLayout, prmtDst, inVals, llvmElemTy, smemBase);
+      return broadcastAs(outVals, dstLayout);
+    }
+
+    // At this point we have a type that's at least 8-bit
+    // and we don't have broadcasting in the registers
+    auto bitwidth = llvmElemTy.getIntOrFloatBitWidth();
+    auto smem = optimalSwizzlingLdSt(srcLayout, dstLayout, bitwidth);
+
+    // Extract reps from smem
+    auto kReg = str_attr("register");
+    auto kReps = str_attr("reps");
+    auto nReps = smem.getInDimSize(kReps);
+    auto reps = LinearLayout::identity1D(nReps, kReg, kReps);
+
+    auto totalStoreCvt = srcLayout.invertAndCompose(smem);
+    auto totalLoadCvt = dstLayout.invertAndCompose(smem);
+
+    // The permutation exists by construction of the reps dimension in
+    // optimalSwizzling
+    auto permStore =
+        regPermForDivide(totalStoreCvt, reps, /*left=*/false).value();
+    totalStoreCvt = permStore.apply(totalStoreCvt);
+    auto permutedInVals = permStore.apply(inVals);
+    auto permLoad =
+        regPermForDivide(totalLoadCvt, reps, /*left=*/false).value();
+    totalLoadCvt = permLoad.apply(totalLoadCvt);
+
+    // Remove the reps and flatten into offset
+    auto storeCvt = *divideRight(totalStoreCvt, reps);
+    auto loadCvt = *divideRight(totalLoadCvt, reps);
+    auto kOffset = str_attr("offset");
+    storeCvt = storeCvt.reshapeOuts({{kOffset, storeCvt.getTotalOutDimSize()}});
+    loadCvt = loadCvt.reshapeOuts({{kOffset, loadCvt.getTotalOutDimSize()}});
+
+    auto tileSize = storeCvt.getInDimSize(kReg);
+
+    assert(permutedInVals.size() == tileSize * nReps);
+    SmallVector<Value> outVals;
+    auto affineOffset = b.i32_val(0);
+    auto maskSpanAffineOffset = 0;
+    auto noPaddingOffset = [](Value v) { return v; };
+    for (int i = 0; i < nReps; ++i) {
+      if (i > 0)
+        b.barrier();
+
+      auto tileInVals =
+          ArrayRef<Value>(permutedInVals).slice(i * tileSize, tileSize);
+      // Store
+      lowerLdStShared(loc, ctx, storeCvt, tileInVals, llvmElemTy, smemBase,
+                      noPaddingOffset, affineOffset, maskSpanAffineOffset,
+                      rewriter, targetInfo);
+      b.barrier();
+      // Load
+      SmallVector<Value> tileOutVals = lowerLdStShared(
+          loc, ctx, loadCvt, {}, llvmElemTy, smemBase, noPaddingOffset,
+          affineOffset, maskSpanAffineOffset, rewriter, targetInfo);
+      llvm::append_range(outVals, tileOutVals);
+    }
+
+    // Undo the permLoad used to divideRight
+    outVals = permLoad.inverse().apply(outVals);
+    return outVals;
+  }
+
+  void transferWithinBlockSwizzling(ConvertLayoutOp op, Value src,
+                                    ConversionPatternRewriter &rewriter) const {
+    auto loc = op.getLoc();
+    auto *ctx = op.getContext();
+    auto srcTy = op.getSrc().getType();
+    auto dstTy = op.getType();
+
+    // Remove the kBlock dimension from the layout as it's the identity in the
+    // cvt
+    auto srcLayout = toLinearLayout(srcTy);
+    auto dstLayout = toLinearLayout(dstTy);
+    auto kReg = str_attr("register");
+    auto kLane = str_attr("lane");
+    auto kWarp = str_attr("warp");
+    srcLayout = srcLayout.sublayout({kReg, kLane, kWarp},
+                                    to_vector(srcLayout.getOutDimNames()));
+    dstLayout = dstLayout.sublayout({kReg, kLane, kWarp},
+                                    to_vector(dstLayout.getOutDimNames()));
+
+    auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
+    auto smemBase =
+        LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op.getOperation());
+    auto inVals = unpackLLElements(loc, src, rewriter);
+    auto outVals = transferWithinBlockSwizzlingImpl(
+        loc, rewriter, srcLayout, dstLayout, inVals, llvmElemTy, smemBase);
+
+    Value result =
+        packLLElements(loc, getTypeConverter(), outVals, rewriter, dstTy);
     rewriter.replaceOp(op, result);
-    return success();
   }

   // Use warp shuffles to implement a layout conversion where data only needs to
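The recursion in transferWithinBlockSwizzlingImpl follows a normalize-recurse-undo pattern: each special case rewrites the inputs, recurses, then inverts the rewrite on the outputs, so the base case only ever sees values of at least 8 bits, non-pointer, with no broadcast registers. A minimal standalone sketch of that pattern (plain C++, not Triton code):

#include <cstdint>
#include <vector>

// Each guard normalizes the inputs, recurses, then undoes the normalization
// on the outputs, mirroring the ptrtoint/inttoptr and zext/trunc pairs above.
std::vector<uint32_t> transfer(std::vector<uint32_t> vals, unsigned bitwidth) {
  if (bitwidth < 8) {
    // Pre: widen to 8 bits (stands in for zext), then recurse.
    auto out = transfer(std::move(vals), 8);
    // Post: narrow back to the original width (stands in for trunc).
    for (auto &v : out)
      v &= (1u << bitwidth) - 1;
    return out;
  }
  // Base case: the real code stores tiles to shared memory and reloads them
  // through the swizzled layout here.
  return vals;
}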
