Commit cc1a80c

Merge commit '0e9706cd4f5d69302e9b7331cc820fdad062c80b'
2 parents 40cde44 + 0e9706c

49 files changed: +2970 −540 lines changed

.github/workflows/integration-tests-amd.yml

Lines changed: 6 additions & 0 deletions
@@ -116,6 +116,12 @@ jobs:
             TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py
           fi

+          # Run tests under triton/python/triton_kernels/tests/ on gfx950 and gfx942
+          if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ] || [ "${{ matrix.runner[0] }}" = "amd-gfx942" ]; then
+            cd ../../triton_kernels/
+            python3 -m pytest -s -n 12 tests/
+          fi
+
       - name: Run asan tests on AMD
         if: false
         run: |

Makefile

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ test-unit: all
 	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
 	# Run attention separately to avoid out of gpu memory
 	$(PYTEST) -vs python/tutorials/06-fused-attention.py
+	$(PYTEST) -vs python/tutorials/gluon/01-attention-forward.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 	$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
 	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ class TargetInfoBase {

   virtual bool supportLdMatrix() const { return false; }
   virtual bool supportStMatrix() const { return false; }
+  virtual bool isCuda() const { return false; }

   // Annotate target specific information to local store operations during
   // lowering to LLVM.
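A quick, hedged illustration of how the new hook is meant to be consumed: a backend's TargetInfo subclass overrides it to report that it targets CUDA. The class name below is hypothetical; only the isCuda() signature comes from the diff above.

// Hypothetical subclass for illustration; real backends define their own
// TargetInfo types derived from TargetInfoBase.
class HypotheticalCudaTargetInfo : public TargetInfoBase {
public:
  // Opt in to code paths that query isCuda() through the base class.
  bool isCuda() const override { return true; }
  // (remaining TargetInfoBase overrides omitted in this sketch)
};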

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 2 additions & 3 deletions
@@ -1143,11 +1143,10 @@ Row |
   let hasCustomAssemblyFormat = 1;

   let extraClassDeclaration = extraDistributedDeclaration # [{
-    SmallVector<int64_t> getElemsPerInstrForOperands() const;
+    SmallVector<int64_t> getElemsPerInstrForOperands(int kDim, int opIdx) const;
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape,
-                                          Type elemType, int kWidth, int opIdx) const;
+                                          Type elemType, int kWidth, int kDim, int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
-    unsigned getKWidthForOperands() const;
     static SmallVector<unsigned> getMNKDimPerInstr();
   }];
 }

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 9 additions & 9 deletions
@@ -45,7 +45,7 @@ class CoarseSchedule {
   const_iterator begin() const { return orderClusters.begin(); }
   iterator end() { return orderClusters.end(); }
   const_iterator end() const { return orderClusters.end(); }
-  size_t size() { return orderClusters.size(); }
+  size_t size() const { return orderClusters.size(); }
   iterator newAtBack() {
     orderClusters.push_back(orderClusters.size());
     return std::prev(orderClusters.end());
@@ -88,7 +88,7 @@ class CoarseSchedule {
   DenseMap<Operation *, std::pair<int, Cluster>> opToStageAndCluster;

   void setNumStages(int numStages) { this->numStages = numStages; }
-  int getNumStages() { return numStages; }
+  int getNumStages() const { return numStages; }

   void insert(Operation *op, int stage, Cluster cluster) {
     if (stage >= numStages) {
@@ -115,7 +115,7 @@ class CoarseSchedule {

   void erase(Operation *op) { opToStageAndCluster.erase(op); }

-  int count(Operation *op) { return opToStageAndCluster.count(op); }
+  int count(Operation *op) const { return opToStageAndCluster.count(op); }

   std::pair<int, Cluster> operator[](Operation *op) {
     return opToStageAndCluster[op];
@@ -129,25 +129,25 @@ class CoarseSchedule {
   Cluster splitClusterBefore(Operation *op, scf::ForOp forOp);

   // Check if op a will show up before op b in the final unrolled code.
-  bool isOpBefore(Operation *a, Operation *b);
+  bool isOpBefore(Operation *a, Operation *b) const;

   // Check if op a is in earlier cluster than op b.
-  bool isOpInEarlierCluster(Operation *a, Operation *b);
+  bool isOpInEarlierCluster(Operation *a, Operation *b) const;

   // Check if op a is in the same cluster as op b.
-  bool isOpInSameCluster(Operation *a, Operation *b);
+  bool isOpInSameCluster(Operation *a, Operation *b) const;

   SmallVector<std::tuple<Operation *, int, Cluster>>
-  getOpsInOrder(scf::ForOp forOp);
+  getOpsInOrder(scf::ForOp forOp) const;
   std::vector<std::pair<Operation *, unsigned>>
-  createFinalSchedule(scf::ForOp forOp);
+  createFinalSchedule(scf::ForOp forOp) const;

   bool empty() const { return opToStageAndCluster.size() == 0; }
   auto end() const { return opToStageAndCluster.end(); }
   auto begin() const { return opToStageAndCluster.begin(); }

   // Set <stage, cluster> based on CoarseSchedule.
-  void serialize(scf::ForOp &forOp);
+  void serialize(scf::ForOp &forOp) const;
   // Create a CoarseSchedule based on forOp's <stage, cluster>.
   LogicalResult deSerialize(scf::ForOp &forOp);
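A small sketch (not part of this commit) of what the const-qualification above buys: read-only helpers can now take the schedule by const reference. The helper name is hypothetical; the member signatures are the ones declared above.

// Hypothetical read-only query over a pipeliner schedule; compiles only now
// that getOpsInOrder() and count() are const.
static int countScheduledOps(const triton::CoarseSchedule &schedule,
                             scf::ForOp forOp) {
  int numOps = 0;
  for (auto [op, stage, cluster] : schedule.getOpsInOrder(forOp))
    numOps += schedule.count(op);
  return numOps;
}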

include/triton/Tools/GenericSwizzling.h

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+#ifndef TRITON_GENERIC_SWIZZLING_H
+#define TRITON_GENERIC_SWIZZLING_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include <cstdint>
+
+namespace mlir::triton {
+class LinearLayout;
+}
+
+namespace mlir::triton::gpu {
+LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
+                              int32_t bitwidth);
+
+std::pair<int, int> logBankConflicts(const LinearLayout &src,
+                                     const LinearLayout &dst,
+                                     const LinearLayout &smem,
+                                     int32_t bitwidth);
+} // namespace mlir::triton::gpu
+
+#endif // TRITON_GENERIC_SWIZZLING_H
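A rough usage sketch for the two new entry points, assembled from the Allocation.cpp change further down; it is not verbatim code from this commit, and the variable names and the 16-bit element width are illustrative.

// Sketch: srcTy and dstTy are the RankedTensorTypes of a layout conversion.
auto srcLayout = gpu::toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
auto dstLayout = gpu::toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
srcLayout = actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
dstLayout = actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);

// Shared-memory layout chosen to minimize bank conflicts for this pair.
LinearLayout smem =
    gpu::optimalSwizzling(srcLayout, dstLayout, /*bitwidth=*/16);

// Scratch elements per repetition, as the allocation analysis computes below.
auto reps = smem.getInDimSize(StringAttr::get(srcTy.getContext(), "reps"));
unsigned elems = smem.getTotalOutDimSize() / reps;

// Diagnostic: log2 bank-conflict counts for the chosen layout (interpreting
// the pair as read/write conflicts is an assumption, not stated in the diff).
auto [readLog, writeLog] =
    gpu::logBankConflicts(srcLayout, dstLayout, smem, /*bitwidth=*/16);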

include/triton/Tools/LayoutUtils.h

Lines changed: 7 additions & 4 deletions
@@ -116,15 +116,18 @@ LinearLayout zerosLike(const LinearLayout &layout);
 // For a layout A with A.hasInDim(kReg), find a permutation of registers action
 // such that action.apply(A) may be divisible by B
 // It's not always true that the action returned by this function will
-// allow us to divideLeft, but it is true that if it if there exists one, it is
-// the one returned by this function.
-std::optional<ColumnAction> regPermForDivideLeft(const LinearLayout &A,
-                                                 const LinearLayout &B);
+// allow us to divideLeft (resp. divideRight), but it is true that if it if
+// there exists one, it is the one returned by this function.
+std::optional<ColumnAction> regPermForDivide(const LinearLayout &A,
+                                             const LinearLayout &B, bool left);

 // For a layout A with A.hasInDim(kReg), find a permutation of registers action
 // such that action.apply(A) has the broadcasted registers removed
 ColumnAction actionRemoveBroadcastedRegs(const LinearLayout &layout);

+std::pair<int64_t, ColumnAction>
+actionAdditiveStrides(const LinearLayout &layout);
+
 // For a layout A with A.hasInDim(kReg), repeat the values so that they have
 // the same broadcasting as layout
 SmallVector<Value> broadcastAs(const SmallVector<Value> &values,
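A minimal sketch (not from this commit) of how the generalized regPermForDivide pairs with divideLeft; tryDivideLeft is a hypothetical helper name, and only the signatures shown above are taken from the diff.

// Hypothetical helper: permute A's register columns so that, if any register
// permutation makes A left-divisible by B, the division below succeeds.
std::optional<LinearLayout> tryDivideLeft(const LinearLayout &A,
                                          const LinearLayout &B) {
  std::optional<ColumnAction> perm = regPermForDivide(A, B, /*left=*/true);
  if (!perm)
    return std::nullopt;
  LinearLayout permuted = perm->apply(A); // reorder A's register basis vectors
  return divideLeft(permuted, B);         // nullopt if no factorization exists
}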

include/triton/Tools/LinearLayout.h

Lines changed: 10 additions & 1 deletion
@@ -453,6 +453,11 @@ class LinearLayout {
   auto getOutDimNames() const { return llvm::make_first_range(outDims); }
   auto getOutDimSizes() const { return llvm::make_second_range(outDims); }

+  // Relevant for reshaping
+  SmallVector<std::pair<StringAttr, int32_t>> getOutDims() const {
+    return to_vector(outDims);
+  }
+
   // Gets the position that this outDim occupies in getOutDimNames(). Asserts
   // if the dim is not present.
   int32_t getOutDimIndex(StringAttr outDim) const;
@@ -620,6 +625,7 @@ class LinearLayout {

   // Compute a C such that A = B * C if it exists.
   // In other words, C = B^{-1} * A.
+  // For divideRight, we compute A = C * B, that is, C = A * B^{-1}.
   // Note that such a C exists iff (every pair of input/output dim of) A is
   // of the form
   // [[B, 0],
@@ -633,6 +639,8 @@ class LinearLayout {
   // same dimensions as A ensures that C is well-defined.
   friend std::optional<LinearLayout> divideLeft(const LinearLayout &A,
                                                 const LinearLayout &B);
+  friend std::optional<LinearLayout> divideRight(const LinearLayout &A,
+                                                 const LinearLayout &B);

   // Returns true if this layout acts trivially (as the identity) on the given
   // dimensions. This means that it's the identity on those dimensions, and it
@@ -798,9 +806,10 @@ class ColumnAction {
   SmallVector<size_t> action;
   StringAttr inDim;
   size_t inSizeLog2;
-  bool isIdentity;
+  bool isIdentity = true;

 public:
+  ColumnAction() = default;
   ColumnAction(ArrayRef<size_t> action, StringAttr inDim, size_t inSizeLog2)
       : action(action), inDim(inDim), inSizeLog2(inSizeLog2) {
     auto it = llvm::max_element(action);
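A short sketch (not part of this commit) restating the factorizations the comments above describe; checkDivision is a hypothetical name, and it assumes the LinearLayout composition operator* referenced in those comments plus an equality operator.

// Hypothetical check: divideLeft finds C with A = B * C, while the new
// divideRight finds C with A = C * B.
void checkDivision(const LinearLayout &A, const LinearLayout &B) {
  if (auto C = divideLeft(A, B))
    assert(A == B * *C);
  if (auto C = divideRight(A, B))
    assert(A == *C * B);
}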

lib/Analysis/Allocation.cpp

Lines changed: 35 additions & 22 deletions
@@ -10,6 +10,8 @@
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Tools/GenericSwizzling.h"
+#include "triton/Tools/LayoutUtils.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -32,6 +34,30 @@ constexpr int kPtrBitWidth = 64;
 // Max shmem LDS/STS instruction in bits
 constexpr int kMaxShmemVecBitLength = 128;

+static unsigned getBitwidth(RankedTensorType ty) {
+  auto isPtr = isa<PointerType>(ty.getElementType());
+  return isPtr ? kPtrBitWidth : std::max(ty.getElementTypeBitWidth(), 8u);
+}
+
+static unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
+                                              RankedTensorType dstTy) {
+  auto *ctx = srcTy.getContext();
+  auto srcLayout = gpu::toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
+  auto dstLayout = gpu::toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
+  srcLayout = actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
+  dstLayout = actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
+  auto bitwidth = getBitwidth(srcTy);
+  auto smem = gpu::optimalSwizzling(srcLayout, dstLayout, bitwidth);
+  auto reps = smem.getInDimSize(StringAttr::get(ctx, "reps"));
+  return smem.getTotalOutDimSize() / reps;
+}
+
+static unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
+                                            RankedTensorType dstTy) {
+  auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
+  return getNumScratchElements(scratchConfig.paddedRepShape);
+}
+
 static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
@@ -135,12 +161,8 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
   scratchConfig.outVec = std::min(scratchConfig.outVec, contiguousShapeDim);
   // Clamp the vector length to kMaxShmemVecBitLength / element bitwidth as this
   // is the max vectorisation
-  auto inBitWidth = isa<PointerType>(srcTy.getElementType())
-                        ? kPtrBitWidth
-                        : srcTy.getElementTypeBitWidth();
-  auto outBitWidth = isa<PointerType>(dstTy.getElementType())
-                         ? kPtrBitWidth
-                         : dstTy.getElementTypeBitWidth();
+  auto inBitWidth = getBitwidth(srcTy);
+  auto outBitWidth = getBitwidth(dstTy);
   scratchConfig.inVec =
       std::min(scratchConfig.inVec, kMaxShmemVecBitLength / inBitWidth);
   scratchConfig.outVec =
@@ -174,27 +196,18 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(
         op->getParentOfType<ModuleOp>());
     return std::max<int>(dstTy.getNumElements(), threadsPerWarp) *
-           std::max<int>(8, dstTy.getElementTypeBitWidth()) / 8;
+           getBitwidth(dstTy) / 8;
   }
   if (auto cvtLayout = dyn_cast<gpu::ConvertLayoutOp>(op)) {
     auto srcTy = cvtLayout.getSrc().getType();
     auto dstTy = cvtLayout.getType();
-    auto srcEncoding = srcTy.getEncoding();
-    auto dstEncoding = dstTy.getEncoding();
-    if (mlir::isa<gpu::SharedEncodingTrait>(srcEncoding) ||
-        mlir::isa<gpu::SharedEncodingTrait>(dstEncoding)) {
-      // Conversions from/to shared memory do not need scratch memory.
+    if (!cvtNeedsSharedMemory(srcTy, dstTy))
       return 0;
-    }
-    // ConvertLayoutOp with both input/output non-shared_layout
-    // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
-    // also possible to realize it with other approaches in restricted
-    // conditions, such as warp-shuffle
-    auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
-    auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
-    return isa<PointerType>(srcTy.getElementType())
-               ? elems * kPtrBitWidth / 8
-               : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
+    // Pesimistically take the max. We will revisit later
+    auto elems = std::max(getNumScratchElemsSwizzledCvt(srcTy, dstTy),
+                          getNumScratchElemsPaddedCvt(srcTy, dstTy));
+
+    return elems * getBitwidth(srcTy) / 8;
   }
   if (isa<AtomicRMWOp, AtomicCASOp>(op)) {
     auto value = op->getOperand(0);
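A worked example of the new scratch-size formula in defaultAllocationAnalysisScratchSizeFn (the numbers are made up for illustration, not taken from the commit): the byte count is the larger of the swizzled and padded element counts times the element size in bytes.

// Illustrative arithmetic only; in the pass these values come from
// getNumScratchElemsSwizzledCvt, getNumScratchElemsPaddedCvt and getBitwidth.
unsigned swizzledElems = 1024;
unsigned paddedElems = 1152;
unsigned bitwidth = 16;                                 // e.g. f16 elements
unsigned elems = std::max(swizzledElems, paddedElems);  // pessimistic max: 1152
unsigned bytes = elems * bitwidth / 8;                  // 1152 * 16 / 8 = 2304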
