intel
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h
Lines changed: 1 addition & 0 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/triton/Tools/GenericSwizzling.h
Lines changed: 22 additions & 0 deletions b/‎include/triton/Tools/GenericSwizzling.h
Lines changed: 22 additions & 0 deletions
diff --git a/‎include/triton/Tools/LayoutUtils.h
Lines changed: 7 additions & 4 deletions b/‎include/triton/Tools/LayoutUtils.h
Lines changed: 7 additions & 4 deletions
diff --git a/‎include/triton/Tools/LinearLayout.h
Lines changed: 10 additions & 1 deletion b/‎include/triton/Tools/LinearLayout.h
Lines changed: 10 additions & 1 deletion
diff --git a/‎lib/Analysis/Allocation.cpp
Lines changed: 35 additions & 22 deletions b/‎lib/Analysis/Allocation.cpp
Lines changed: 35 additions & 22 deletions
@@ -96,6 +96,7 @@ class TargetInfoBase {
 
   virtual bool supportLdMatrix() const { return false; }
   virtual bool supportStMatrix() const { return false; }
+  virtual bool isCuda() const { return false; }
 
   // Annotate target specific information to local store operations during
   // lowering to LLVM.
 
@@ -0,0 +1,22 @@
+#ifndef TRITON_GENERIC_SWIZZLING_H
+#define TRITON_GENERIC_SWIZZLING_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include <cstdint>
+#include <utility> // std::pair, used by logBankConflicts below
+
+namespace mlir::triton {
+// Forward declaration only: this header just names LinearLayout in function
+// signatures, so the full definition is not needed here.
+class LinearLayout;
+} // namespace mlir::triton
+
+namespace mlir::triton::gpu {
+// Computes a LinearLayout from the `src` and `dst` layouts and an element
+// `bitwidth`.
+// NOTE(review): presumably the returned layout is the swizzled shared-memory
+// layout used to convert from `src` to `dst` -- confirm against the
+// implementation.
+LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
+                              int32_t bitwidth);
+
+// Returns a pair of ints computed from the `src`, `dst` and `smem` layouts and
+// an element `bitwidth`.
+// NOTE(review): presumably the log2 of the bank conflicts on each side of the
+// conversion through `smem`; which element of the pair is which is not
+// visible from this header -- confirm against the implementation.
+std::pair<int, int> logBankConflicts(const LinearLayout &src,
+                                     const LinearLayout &dst,
+                                     const LinearLayout &smem,
+                                     int32_t bitwidth);
+} // namespace mlir::triton::gpu
+
+#endif // TRITON_GENERIC_SWIZZLING_H
@@ -116,15 +116,18 @@ LinearLayout zerosLike(const LinearLayout &layout);
 // For a layout A with A.hasInDim(kReg), find a permutation of registers action
 // such that action.apply(A) may be divisible by B
 // It's not always true that the action returned by this function will
-// allow us to divideLeft, but it is true that if it if there exists one, it is
-// the one returned by this function.
-std::optional<ColumnAction> regPermForDivideLeft(const LinearLayout &A,
-                                                 const LinearLayout &B);
+// allow us to divideLeft (resp. divideRight), but it is true that if
+// there exists one, it is the one returned by this function.
+std::optional<ColumnAction> regPermForDivide(const LinearLayout &A,
+                                             const LinearLayout &B, bool left);
 
 // For a layout A with A.hasInDim(kReg), find a permutation of registers action
 // such that action.apply(A) has the broadcasted registers removed
 ColumnAction actionRemoveBroadcastedRegs(const LinearLayout &layout);
 
+std::pair<int64_t, ColumnAction>
+actionAdditiveStrides(const LinearLayout &layout);
+
 // For a layout A with A.hasInDim(kReg), repeat the values so that they have
 // the same broadcasting as layout
 SmallVector<Value> broadcastAs(const SmallVector<Value> &values,
 
@@ -453,6 +453,11 @@ class LinearLayout {
   auto getOutDimNames() const { return llvm::make_first_range(outDims); }
   auto getOutDimSizes() const { return llvm::make_second_range(outDims); }
 
+  // Returns a copy of the (name, size) pair of every output dimension, in
+  // the order they are stored in `outDims`. Relevant for reshaping.
+  SmallVector<std::pair<StringAttr, int32_t>> getOutDims() const {
+    SmallVector<std::pair<StringAttr, int32_t>> dims(outDims.begin(),
+                                                     outDims.end());
+    return dims;
+  }
+
   // Gets the position that this outDim occupies in getOutDimNames().  Asserts
   // if the dim is not present.
   int32_t getOutDimIndex(StringAttr outDim) const;
@@ -620,6 +625,7 @@ class LinearLayout {
 
   // Compute a C such that A = B * C if it exists.
   // In other words, C = B^{-1} * A.
+  // For divideRight, we compute A = C * B, that is, C = A * B^{-1}.
   // Note that such a C exists iff (every pair of input/output dim of) A is
   // of the form
   // [[B, 0],
@@ -633,6 +639,8 @@ class LinearLayout {
   // same dimensions as A ensures that C is well-defined.
   friend std::optional<LinearLayout> divideLeft(const LinearLayout &A,
                                                 const LinearLayout &B);
+  friend std::optional<LinearLayout> divideRight(const LinearLayout &A,
+                                                 const LinearLayout &B);
 
   // Returns true if this layout acts trivially (as the identity) on the given
   // dimensions. This means that it's the identity on those dimensions, and it
@@ -798,9 +806,10 @@ class ColumnAction {
   SmallVector<size_t> action;
   StringAttr inDim;
   size_t inSizeLog2;
-  bool isIdentity;
+  bool isIdentity = true;
 
 public:
+  ColumnAction() = default;
   ColumnAction(ArrayRef<size_t> action, StringAttr inDim, size_t inSizeLog2)
       : action(action), inDim(inDim), inSizeLog2(inSizeLog2) {
     auto it = llvm::max_element(action);
 
@@ -10,6 +10,8 @@
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Tools/GenericSwizzling.h"
+#include "triton/Tools/LayoutUtils.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -32,6 +34,30 @@ constexpr int kPtrBitWidth = 64;
 // Max shmem LDS/STS instruction in bits
 constexpr int kMaxShmemVecBitLength = 128;
 
+// Bit width used to size an element of `ty`: kPtrBitWidth when the element
+// type is a pointer, otherwise the element bit width rounded up to at least
+// 8 bits.
+static unsigned getBitwidth(RankedTensorType ty) {
+  if (isa<PointerType>(ty.getElementType()))
+    return kPtrBitWidth;
+  return std::max(ty.getElementTypeBitWidth(), 8u);
+}
+
+// Number of scratch elements for a layout conversion from `srcTy` to `dstTy`
+// when going through the layout produced by gpu::optimalSwizzling: the total
+// out-dim size of that layout divided by the size of its "reps" in-dim.
+static unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
+                                              RankedTensorType dstTy) {
+  // Linear layout of a tensor type with its broadcasted registers removed.
+  auto withoutBroadcastedRegs = [](RankedTensorType ty) {
+    auto layout = gpu::toLinearLayout(ty.getShape(), ty.getEncoding());
+    return actionRemoveBroadcastedRegs(layout).apply(layout);
+  };
+  auto src = withoutBroadcastedRegs(srcTy);
+  auto dst = withoutBroadcastedRegs(dstTy);
+  // The swizzling is computed for the source element width.
+  auto swizzled = gpu::optimalSwizzling(src, dst, getBitwidth(srcTy));
+  auto kReps = StringAttr::get(srcTy.getContext(), "reps");
+  return swizzled.getTotalOutDimSize() / swizzled.getInDimSize(kReps);
+}
+
+// Number of scratch elements for a layout conversion from `srcTy` to `dstTy`
+// on the padded path: the element count of the `paddedRepShape` reported by
+// getScratchConfigForCvt.
+static unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
+                                            RankedTensorType dstTy) {
+  return getNumScratchElements(
+      getScratchConfigForCvt(srcTy, dstTy).paddedRepShape);
+}
+
 static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
@@ -135,12 +161,8 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
   scratchConfig.outVec = std::min(scratchConfig.outVec, contiguousShapeDim);
   // Clamp the vector length to kMaxShmemVecBitLength / element bitwidth as this
   // is the max vectorisation
-  auto inBitWidth = isa<PointerType>(srcTy.getElementType())
-                        ? kPtrBitWidth
-                        : srcTy.getElementTypeBitWidth();
-  auto outBitWidth = isa<PointerType>(dstTy.getElementType())
-                         ? kPtrBitWidth
-                         : dstTy.getElementTypeBitWidth();
+  auto inBitWidth = getBitwidth(srcTy);
+  auto outBitWidth = getBitwidth(dstTy);
   scratchConfig.inVec =
       std::min(scratchConfig.inVec, kMaxShmemVecBitLength / inBitWidth);
   scratchConfig.outVec =
@@ -174,27 +196,18 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(
         op->getParentOfType<ModuleOp>());
     return std::max<int>(dstTy.getNumElements(), threadsPerWarp) *
-           std::max<int>(8, dstTy.getElementTypeBitWidth()) / 8;
+           getBitwidth(dstTy) / 8;
   }
   if (auto cvtLayout = dyn_cast<gpu::ConvertLayoutOp>(op)) {
     auto srcTy = cvtLayout.getSrc().getType();
     auto dstTy = cvtLayout.getType();
-    auto srcEncoding = srcTy.getEncoding();
-    auto dstEncoding = dstTy.getEncoding();
-    if (mlir::isa<gpu::SharedEncodingTrait>(srcEncoding) ||
-        mlir::isa<gpu::SharedEncodingTrait>(dstEncoding)) {
-      // Conversions from/to shared memory do not need scratch memory.
+    if (!cvtNeedsSharedMemory(srcTy, dstTy))
       return 0;
-    }
-    // ConvertLayoutOp with both input/output non-shared_layout
-    // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
-    //       also possible to realize it with other approaches in restricted
-    //       conditions, such as warp-shuffle
-    auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
-    auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
-    return isa<PointerType>(srcTy.getElementType())
-               ? elems * kPtrBitWidth / 8
-               : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
+    // Pessimistically take the max. We will revisit later
+    auto elems = std::max(getNumScratchElemsSwizzledCvt(srcTy, dstTy),
+                          getNumScratchElemsPaddedCvt(srcTy, dstTy));
+
+    return elems * getBitwidth(srcTy) / 8;
   }
   if (isa<AtomicRMWOp, AtomicCASOp>(op)) {
     auto value = op->getOperand(0);