@@ -543,7 +543,7 @@ bool supportMMA(Value value, int version) {
          (elemTy.isInteger(8) && version >= 2);
 }

-bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy) {
+bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy) {
   auto blockedLayout = dyn_cast<BlockedEncodingAttr>(srcTy.getEncoding());
   auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
   if (blockedLayout == nullptr || dotOperandLayout == nullptr)
@@ -647,57 +647,94 @@ bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
   return ans;
 }

-bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy) {
+// We get the smallest submap of srcTy^{-1} * dstTy that is not the identity
+// under kBlock, kWarp or kLane (in that order). The idea here is that if we
+// have a transformation that's the identity on kBlock, we don't need to use
+// distributed shared memory. If it's also the identity on kWarp, we can
+// transfer via warp-shuffles, and if it's the identity on kLane, we just
+// have to reorder the registers.
+std::optional<LinearLayout> minimalCvtLayout(RankedTensorType srcTy,
+                                             RankedTensorType dstTy) {
   MLIRContext *ctx = srcTy.getContext();
   std::optional<LinearLayout> srcLayout =
       toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
   std::optional<LinearLayout> dstLayout =
       toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
-  if (srcLayout.has_value() && dstLayout.has_value()) {
-    // comp describes the layout function for converting from src to dst.
-    LinearLayout comp = srcLayout->invertAndCompose(*dstLayout);
-    StringAttr kLane = StringAttr::get(ctx, "lane");
-    StringAttr kWarp = StringAttr::get(ctx, "warp");
-    StringAttr kBlock = StringAttr::get(ctx, "block");
-    // TODO(jlebar): These checks are overly-restrictive. For example, we can
-    // transfer by shuffling registers (case 1) if and only if all of the bases
-    // for `register` have 0s for lane, warp, and block. But the check below is
-    // stronger than this, checking also that the choice of lane/warp/block does
-    // not affect the permutation of registers. If we allow different
-    // lane/warp/blocks to have different permutations, we can generalize this.
-    if (comp.divideRight(LinearLayout::identity1D(comp.getInDimSize(kLane),
-                                                  kLane, kLane) *
-                         LinearLayout::identity1D(comp.getInDimSize(kWarp),
-                                                  kWarp, kWarp) *
-                         LinearLayout::identity1D(comp.getInDimSize(kBlock),
-                                                  kBlock, kBlock))
-            .has_value()) {
-      return true;
+  if (!(srcLayout.has_value() && dstLayout.has_value()))
+    return std::nullopt;
+  StringAttr kRegister = StringAttr::get(ctx, "register");
+  StringAttr kLane = StringAttr::get(ctx, "lane");
+  StringAttr kWarp = StringAttr::get(ctx, "warp");
+  StringAttr kBlock = StringAttr::get(ctx, "block");
+  auto numSrcRegs = srcLayout->getInDimSize(kRegister);
+  auto numDstRegs = dstLayout->getInDimSize(kRegister);
+  // The `invertAndCompose` function will generate a layout that is injective
+  // by assigning new output dimensions to free variables. For instance,
+  // consider a scenario where `srcLayout` has a free variable in the lane
+  // dimension, while `dstLayout` has two free variables in the lane
+  // dimension and also a larger number of registers.
+  // The injective form of `srcLayout` will add only a single additional row
+  // to the transformation matrix, whereas the injective form of `dstLayout`
+  // will add two additional rows. This discrepancy causes misleading results
+  // because the matrices end up with a different number of rows.
+  //
+  // Take `dstLayout ⋅ srcLayout^-1` as an example:
+  //
+  // - `injective(dstLayout)`: [n, m] → [n + 2, m]
+  // - `injective(srcLayout)`: [n, m] → [n + 1, m]
+  // - `injective(srcLayout)^-1`: [n + 1, m] → [m, n + 1]
+  // - `injective(dstLayout) ⋅ injective(srcLayout)^-1`: [n + 2, m] ⋅ [m, n +
+  // 1] → [n + 2, n + 1]
+  //
+  // Here, the `(n + 1)`-th row added by `dstLayout` represents the free
+  // variable in registers, and the `(n + 2)`-th row represents the free
+  // variable in lanes. However, the `(n + 1)`-th row added by `srcLayout`
+  // represents the free variable in lanes. As a result, the `(n + 1)`-th rows
+  // of the two layouts do not correspond to the same free variable.
+  //
+  // To address this issue, we pad the free variables in `srcLayout` and
+  // `dstLayout` to ensure they have the same number of registers. This
+  // guarantees that the resulting matrices have the same number of rows,
+  // ensuring consistency in the composition process.
+  auto numRegs = std::max(numSrcRegs, numDstRegs);
+  auto srcLayoutWithFreeRegs = srcLayout->resize(kRegister, numRegs);
+  auto dstLayoutWithFreeRegs = dstLayout->resize(kRegister, numRegs);
+  // comp describes the layout function to create dst from src.
+  LinearLayout comp =
+      dstLayoutWithFreeRegs.invertAndCompose(srcLayoutWithFreeRegs);
+  // We try to quotient by the largest subspace first
+  auto dims = SmallVector<StringRef>{"block", "warp", "lane", "register"};
+  for (auto dim : dims) {
+    auto quotient = comp.quotient(StringAttr::get(ctx, dim));
+    if (!quotient.has_value()) {
+      break;
     }
+    comp = *quotient;
   }
-  return false;
+  return comp;
+}
+
+bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy) {
+  auto layout = minimalCvtLayout(srcTy, dstTy);
+  MLIRContext *ctx = srcTy.getContext();
+  if (!layout.has_value()) {
+    return false;
+  }
+  auto kRegister = StringAttr::get(ctx, "register");
+  auto outDims = llvm::to_vector(layout->getOutDimNames());
+  return outDims.empty() || ArrayRef(outDims) == ArrayRef({kRegister});
 }

 bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
+  auto layout = minimalCvtLayout(srcTy, dstTy);
   MLIRContext *ctx = srcTy.getContext();
-  std::optional<LinearLayout> srcLayout =
-      toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
-  std::optional<LinearLayout> dstLayout =
-      toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
-  if (srcLayout.has_value() && dstLayout.has_value()) {
-    // comp describes the layout function for converting from src to dst.
-    LinearLayout comp = srcLayout->invertAndCompose(*dstLayout);
-    StringAttr kWarp = StringAttr::get(ctx, "warp");
-    StringAttr kBlock = StringAttr::get(ctx, "block");
-    if (comp.divideRight(LinearLayout::identity1D(comp.getInDimSize(kWarp),
-                                                  kWarp, kWarp) *
-                         LinearLayout::identity1D(comp.getInDimSize(kBlock),
-                                                  kBlock, kBlock))
-            .has_value()) {
-      return true;
-    }
+  if (!layout.has_value()) {
+    return false;
   }
-  return false;
+  auto kRegister = StringAttr::get(ctx, "register");
+  auto kLane = StringAttr::get(ctx, "lane");
+  return llvm::to_vector(layout->getOutDimNames()) ==
+         llvm::SmallVector<StringAttr, 2>{kRegister, kLane};
 }

 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
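
The register padding is the core of the new `minimalCvtLayout` helper: both layouts are resized to the same number of `register` inputs before `invertAndCompose`, so the rows that the composition adds for free variables line up. A minimal sketch of that step in isolation, using only calls that appear in this patch (`getInDimSize`, `resize`, `invertAndCompose`); the helper name `composeWithPaddedRegs` is hypothetical:

// Sketch only: the padding step from minimalCvtLayout, isolated so the
// intent is easier to see. composeWithPaddedRegs is a hypothetical name.
static LinearLayout composeWithPaddedRegs(LinearLayout srcLayout,
                                          LinearLayout dstLayout,
                                          MLIRContext *ctx) {
  StringAttr kRegister = StringAttr::get(ctx, "register");
  // Pad both layouts to the same number of registers so the free variables
  // introduced by invertAndCompose refer to matching rows.
  auto numRegs = std::max(srcLayout.getInDimSize(kRegister),
                          dstLayout.getInDimSize(kRegister));
  return dstLayout.resize(kRegister, numRegs)
      .invertAndCompose(srcLayout.resize(kRegister, numRegs));
}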
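Downstream, the predicates read off how much of the minimal layout survived the quotients: an empty result or `{register}` means a pure in-thread register permutation, `{register, lane}` means data only crosses lanes within a warp, and anything still involving `warp` or `block` has to go through (distributed) shared memory. A sketch of how a caller might dispatch on them; only the three `cvt*` predicates come from this file, the `lowerVia*` helpers are hypothetical placeholders:

// Sketch only: pick the cheapest conversion path using the predicates above.
// lowerViaRegisters / lowerViaWarpShuffle / lowerViaSharedMemory are
// hypothetical helpers, not part of this patch.
void lowerConvertLayout(RankedTensorType srcTy, RankedTensorType dstTy) {
  if (cvtReordersRegisters(srcTy, dstTy)) {
    // Minimal layout is trivial or touches only `register`: each thread
    // already owns the right elements and just permutes them.
    lowerViaRegisters(srcTy, dstTy);
  } else if (cvtNeedsWarpShuffle(srcTy, dstTy)) {
    // Minimal layout maps onto {register, lane}: values move between lanes
    // of the same warp, so warp shuffles are enough.
    lowerViaWarpShuffle(srcTy, dstTy);
  } else {
    // The conversion also mixes warps and/or blocks; fall back to
    // (distributed) shared memory.
    lowerViaSharedMemory(srcTy, dstTy);
  }
}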