
Commit 763d9a5

Merge OpenAI Triton commit e71689d (#4590)
This PR changes the Triton base from 09d5113 to e71689d (Jun 24). Pass rate: 97.12%

2 parents: 2b612ce + 1d605e1 · commit: 763d9a5

File tree: 58 files changed (+653 / -285 lines)


README.md

Lines changed: 13 additions & 0 deletions
````diff
@@ -55,6 +55,17 @@ downloads a prebuilt LLVM, but you can also build LLVM from source and use that.
 LLVM does not have a stable API, so the Triton build will not work at an
 arbitrary LLVM version.
 
+For convenience, use the following command to build LLVM and install Triton with the custom LLVM:
+
+```shell
+make dev-install-llvm
+```
+
+<details>
+<summary>
+Alternatively, follow these steps to build LLVM from source manually.
+</summary>
+
 1. Find the version of LLVM that Triton builds against. Check
    `cmake/llvm-hash.txt` to see the current version. For example, if it says:
        49af6502c6dcb4a7f7520178bd14df396f78240c
@@ -86,6 +97,8 @@ arbitrary LLVM version.
    LLVM_SYSPATH=$LLVM_BUILD_DIR \
    pip install -e .
 
+</details>
+
 # Tips for building
 
 - Set `TRITON_BUILD_WITH_CLANG_LLD=true` as an environment variable to use clang
````
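Judging from the README context alone, the new `make dev-install-llvm` target presumably wraps the manual flow kept in the `<details>` block: building the LLVM revision pinned in `cmake/llvm-hash.txt` and then installing Triton with `LLVM_SYSPATH` pointing at that build. The Makefile itself is not part of this diff, so this is an inference, not a description of the target's contents.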

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 2 deletions
```diff
@@ -296,8 +296,6 @@ class TritonLLVMIRRewriter : public IRRewriter, public TritonLLVMOpBuilder {
 // Types
 #define ptr_ty(...) LLVM::LLVMPointerType::get(__VA_ARGS__)
 #define int_ty(width) rewriter.getIntegerType(width)
-#define i64_ty rewriter.getIntegerType(64)
-#define i32_ty rewriter.getIntegerType(32)
 #define i16_ty rewriter.getIntegerType(16)
 #define i32_ty rewriter.getIntegerType(32)
 #define i64_ty rewriter.getIntegerType(64)
```
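The two deleted lines are pure de-duplication: as the surrounding context shows, `i32_ty` and `i64_ty` were each defined twice in this macro block, and the surviving `i16_ty`/`i32_ty`/`i64_ty` trio keeps every definition unique.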

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 6 additions & 7 deletions
```diff
@@ -1002,11 +1002,11 @@ An encoding for tensors that have been produced by MFMA matrix core instructions
 available on AMD Instinct GPUs of CDNA architectures.
 
 It is characterized by the following parameters:
-- `versionMajor` and `versionMinor` indicates the GPU architecture:
-  - 1.0: gfx908, i.e. CDNA1
-  - 2.0: gfx90a: i.e. CDNA2
-  - 3.0: gfx942: CDNA3
-  - 4.0: gfx950: CDNA4
+- `version` indicates the GPU architecture:
+  - 1: gfx908: CDNA1
+  - 2: gfx90a: CDNA2
+  - 3: gfx942: CDNA3
+  - 4: gfx950: CDNA4
 - `warpsPerCTA` indicates the warp layout in the block.
 - `MDim` and `NDim` indicate the dimension of the output of the mfma instruction.
 - `isTransposed` indicates the result tensor is transposed so that it can be converted to dotOperand layout
@@ -1096,8 +1096,7 @@ V [ 0,4,8...60 1,5...61 2,6...62 3,7...63 ] [ 128,132...188 129,
 
 let parameters = (
   ins
-  "unsigned": $versionMajor,
-  "unsigned": $versionMinor,
+  "unsigned": $version,
   ArrayRefParameter<"unsigned">:$warpsPerCTA,
   "unsigned":$MDim,
   "unsigned":$NDim,
```

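The MFMA version collapses into a single integer; the minor version carried no information, since the verifier (updated in `lib/Dialect/TritonGPU/IR/Dialect.cpp` below) previously required `versionMinor == 0`. In textual IR the attribute therefore shrinks from roughly `<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [...], ...}>` to `<{version = 3, warpsPerCTA = [...], ...}>`, matching the updated `print` and `parse` methods.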
include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -85,7 +85,7 @@ class CoarseSchedule {
   using Cluster = ClusterList::iterator;
   using ClusterHash = size_t;
 
-  DenseMap<Operation *, std::pair<int, Cluster>> opToStageAndCluster;
+  llvm::MapVector<Operation *, std::pair<int, Cluster>> opToStageAndCluster;
 
   void setNumStages(int numStages) { this->numStages = numStages; }
   int getNumStages() const { return numStages; }
```
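The container swap is subtle but load-bearing: `DenseMap` iterates in a hash-dependent order that can differ between runs, while `llvm::MapVector` iterates in insertion order, which keeps any pass that walks `opToStageAndCluster` deterministic. A minimal sketch of the difference, assuming deterministic iteration is the motivation (illustrative code, not part of the commit):

```cpp
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // MapVector stores a vector of entries next to the map, so iteration
  // follows insertion order. A DenseMap populated with the same keys may
  // iterate in any order, and that order can change across runs or builds.
  llvm::MapVector<int, const char *> schedule;
  schedule.insert({30, "epilogue"});
  schedule.insert({10, "prologue"});
  schedule.insert({20, "body"});
  for (const auto &[key, label] : schedule) // always 30, 10, 20
    llvm::outs() << key << " -> " << label << "\n";
  return 0;
}
```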

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 16 additions & 8 deletions
```diff
@@ -154,6 +154,10 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   assert(layout.getNumInDims() == indices.size());
   assert(llvm::equal(layout.getInDimNames(), llvm::make_first_range(indices)));
+  // Trivial layout
+  if (layout.getNumOutDims() == 0) {
+    return {};
+  }
 
   // This function can emit a lot of MLIR code, which ultimately makes
   // compilation slow. (We think this shouldn't be the case -- it's not *that*
@@ -167,25 +171,29 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
   SmallVector<std::pair<StringAttr, int32_t>> constantIns;
   SmallVector<std::pair<StringAttr, Value>> nonConstantIns;
   for (auto [inDimName, idx] : indices) {
-    if (auto constant = idx.getDefiningOp<LLVM::ConstantOp>()) {
-      constantIns.push_back(
-          {inDimName, cast<IntegerAttr>(constant.getValue()).getInt()});
+    APInt constant;
+    if (matchPattern(idx, m_ConstantInt(&constant))) {
+      constantIns.push_back({inDimName, constant.getSExtValue()});
     } else {
       constantIns.push_back({inDimName, 0});
      nonConstantIns.push_back({inDimName, idx});
     }
   }
-  SmallVector<int32_t> constantComponent =
-      llvm::to_vector(llvm::make_second_range(layout.apply(constantIns)));
 
+  // Compute constant part of the output and wrap it as values
   Value zero = b.i32_val(0);
   SmallVector<std::pair<StringAttr, Value>> outIndices;
-  for (auto [i, outDimName] : llvm::enumerate(layout.getOutDimNames())) {
-    if (constantComponent[i] == 0)
+  for (auto [outDimName, constant] : layout.apply(constantIns)) {
+    if (constant == 0)
       outIndices.push_back({outDimName, zero});
     else
-      outIndices.push_back({outDimName, b.i32_val(constantComponent[i])});
+      outIndices.push_back({outDimName, b.i32_val(constant)});
+  }
+
+  if (nonConstantIns.size() == 0) {
+    return outIndices;
   }
+
   // Happy path: Only one output.
   if (outIndices.size() == 1) {
     SmallVector<StringAttr> inDimNames;
```
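Besides switching constant detection from a hard-coded `LLVM::ConstantOp` cast to the more general `matchPattern`/`m_ConstantInt`, this adds two early exits: a layout with no output dimensions returns nothing, and an all-constant input returns the folded `outIndices` without emitting any IR. The fold is sound because a linear layout is linear over XOR, so the constant and variable components can be applied separately and combined. A toy model of that property (a sketch with a made-up `ToyLinearLayout`, not the Triton class):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Toy stand-in for a single-input, single-output linear layout: the output
// is the XOR of the basis vectors selected by the set bits of the input.
struct ToyLinearLayout {
  std::vector<uint32_t> bases; // bases[i] = output contribution of input bit i
  uint32_t apply(uint32_t in) const {
    uint32_t out = 0;
    for (size_t i = 0; i < bases.size(); ++i)
      if (in & (1u << i))
        out ^= bases[i];
    return out;
  }
};

int main() {
  ToyLinearLayout layout{{0b0001, 0b0100, 0b0110}};
  // XOR-linearity: apply(a ^ b) == apply(a) ^ apply(b). With the constant
  // and variable parts of an index living in disjoint bit positions, the
  // constant part folds to an immediate at compile time and only the
  // variable part needs emitted IR.
  for (uint32_t a = 0; a < 8; ++a)
    for (uint32_t b = 0; b < 8; ++b)
      assert(layout.apply(a ^ b) == (layout.apply(a) ^ layout.apply(b)));
  return 0;
}
```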

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 7 additions & 3 deletions
```diff
@@ -504,9 +504,13 @@ struct MemDescSubviewOpConversion
       // The order gives us the honest-to-goodness layout rank
       auto srcAllocShape =
           srcTy.getAllocShape().take_back(getOrder(srcTy).size());
-      auto llInv = toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
-      offset =
-          applyLinearLayout(loc, rewriter, llInv, logicalOffsets)[0].second;
+      auto ll = toLinearLayout(srcAllocShape, srcTy.getEncoding());
+      // Checked in the verifier.
+      assert(ll.getInDimSize(str_attr("block")) == 1);
+      auto kOffset = str_attr("offset");
+      ll = ll.reshapeIns({{kOffset, ll.getTotalInDimSize()}});
+      offset = applyLinearLayout(loc, rewriter, ll.invert(), logicalOffsets)[0]
+                   .second;
     }
 
     auto base = smemObj.getBase();
```
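Instead of inverting the layout as-is, the lowering now first collapses all input dimensions into the single `offset` dimension with `reshapeIns`, so the inverted layout maps logical tensor offsets directly to one linear shared-memory offset. The `assert` on the `block` input dimension is backed by the new check in `MemDescSubviewOp::verify()` (see `lib/Dialect/TritonGPU/IR/Ops.cpp` below).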

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 35 additions & 29 deletions
```diff
@@ -23,7 +23,6 @@
 #include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
-#include "triton/Tools/Sys/GetEnv.hpp"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/MathExtras.h"
@@ -428,6 +427,15 @@ getDefaultBlockedEncoding(MLIRContext *context, ArrayRef<int64_t> shape,
   return encoding;
 }
 
+bool isSplitCompatible(MLIRContext *ctx, const LinearLayout &ll) {
+  auto lastDim = ll.getNumOutDims() - 1;
+  auto kReg = StringAttr::get(ctx, "register");
+  auto kLastDim = StringAttr::get(ctx, "dim" + std::to_string(lastDim));
+  auto sublayout =
+      ll.sublayout({kReg}, {kLastDim}).removeZeroBasesAlongDim(kReg);
+  return sublayout == LinearLayout::identity1D(2, kReg, kLastDim);
+}
+
 LogicalResult tryJoinOnAxis(MLIRContext *ctx, const LinearLayout &inLl,
                             LinearLayout &outLl, bool fwdInference, int axis,
                             std::optional<Location> loc) {
```
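Reading the new helper: it restricts the layout to the register-to-last-dimension sublayout, drops register bases that contribute nothing to it, and accepts only when the remainder is exactly `identity1D(2, ...)`, i.e. when a single register basis toggles the size-2 last dimension. That appears to be precisely the shape a `join` along the last axis produces, which the new `inferDefaultJoinOpEncoding` fast path below relies on.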
```diff
@@ -1331,8 +1339,7 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
   if (parser.parseGreater().failed())
     return {};
 
-  unsigned versionMajor = 0;
-  unsigned versionMinor = 0;
+  unsigned version = 0;
   SmallVector<unsigned> warpsPerCTA;
   SmallVector<unsigned> instrShape;
   bool isTransposed;
@@ -1341,12 +1348,8 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
   std::optional<SmallVector<unsigned>> CTAOrder;
 
   for (const NamedAttribute &attr : dict) {
-    if (attr.getName() == "versionMajor") {
-      if (parseUInt(parser, attr, versionMajor, "versionMajor").failed())
-        return {};
-    }
-    if (attr.getName() == "versionMinor") {
-      if (parseUInt(parser, attr, versionMinor, "versionMinor").failed())
+    if (attr.getName() == "version") {
+      if (parseUInt(parser, attr, version, "verison").failed())
         return {};
     }
     if (attr.getName() == "warpsPerCTA") {
@@ -1385,14 +1388,13 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
     return {};
 
   return parser.getChecked<AMDMfmaEncodingAttr>(
-      parser.getContext(), versionMajor, versionMinor, warpsPerCTA,
-      instrShape[0], instrShape[1], isTransposed, *CTALayout);
+      parser.getContext(), version, warpsPerCTA, instrShape[0], instrShape[1],
+      isTransposed, *CTALayout);
 }
 
 void AMDMfmaEncodingAttr::print(AsmPrinter &printer) const {
   printer << "<{"
-          << "versionMajor = " << getVersionMajor()   //
-          << ", versionMinor = " << getVersionMinor() //
+          << "version = " << getVersion() //
           << ", warpsPerCTA = [" << getWarpsPerCTA() << "]" //
           << ", instrShape = [" << ArrayRef{getMDim(), getNDim()} << "]" //
           << ", isTransposed = " << getIsTransposed();
@@ -1401,17 +1403,12 @@ void AMDMfmaEncodingAttr::print(AsmPrinter &printer) const {
   printer << "}>";
 }
 
-LogicalResult
-AMDMfmaEncodingAttr::verify(function_ref<mlir::InFlightDiagnostic()> emitError,
-                            unsigned versionMajor, unsigned versionMinor,
-                            llvm::ArrayRef<unsigned int> warpsPerCTA,
-                            unsigned mDim, unsigned nDim, bool isTransposed,
-                            mlir::triton::gpu::CTALayoutAttr) {
-  if (!(versionMajor >= 0 && versionMajor <= 4)) {
-    return emitError() << "major version must be in the [0, 4] range";
-  }
-  if (versionMinor != 0) {
-    return emitError() << "minor version must be 0";
+LogicalResult AMDMfmaEncodingAttr::verify(
+    function_ref<mlir::InFlightDiagnostic()> emitError, unsigned version,
+    llvm::ArrayRef<unsigned int> warpsPerCTA, unsigned mDim, unsigned nDim,
+    bool isTransposed, mlir::triton::gpu::CTALayoutAttr) {
+  if (!(version >= 0 && version <= 4)) {
+    return emitError() << "version must be in the [0, 4] range";
   }
   if (!((mDim == 32 && nDim == 32) || (mDim == 16 && nDim == 16))) {
     return emitError()
@@ -1965,7 +1962,7 @@ SwizzledSharedEncodingAttr AMDMfmaEncodingAttr::composeSharedLayoutForOperand(
   bool isKContig = sharedOrder[0] == kDimIndex;
   // GFX950 supports LDS transpose load instructions, so we need swizzling even
   // when K dimension is not the contiguous dimension.
-  bool isGFX950 = getVersionMajor() == 4;
+  bool isGFX950 = getVersion() == 4;
   bool swizzleNonKContig =
       isGFX950 && (elemBitWidth == 8 || elemBitWidth == 16);
 
@@ -2654,7 +2651,19 @@ struct TritonGPUInferLayoutInterface
   inferDefaultJoinOpEncoding(Attribute srcEnc, Attribute &dstEnc,
                              ArrayRef<int64_t> shape,
                              std::optional<Location> loc) const override {
-    if (auto enc = mlir::dyn_cast<BlockedEncodingAttr>(srcEnc)) {
+    auto ctx = getContext();
+    if (auto enc = mlir::dyn_cast<SliceEncodingAttr>(srcEnc);
+        enc && enc.getDim() == shape.size()) {
+      SmallVector<int64_t> joinedShape(shape);
+      joinedShape.push_back(2);
+      auto parent = enc.getParent();
+      auto parentLL = toLinearLayout(joinedShape, parent);
+
+      if (isSplitCompatible(ctx, parentLL)) {
+        dstEnc = parent;
+        return success();
+      }
+    } else if (auto enc = mlir::dyn_cast<BlockedEncodingAttr>(srcEnc)) {
       // JoinOp takes two tensors of shape AxBxC and generates a tensor of shape
       // AxBxCx2. The encoding is the same as the input, but with 2 elems per
       // thread in the new dimension. The new dimension is the fastest running
@@ -2679,8 +2688,6 @@ struct TritonGPUInferLayoutInterface
       return success();
     }
 
-    auto ctx = getContext();
-
     // Append dim to shape
     auto ll = toLinearLayout(shape, srcEnc);
     SmallVector<int64_t> dstShape(shape.begin(), shape.end());
@@ -2757,7 +2764,6 @@ struct TritonGPUInferLayoutInterface
     if (!result.succeeded()) {
      return failure();
     }
-
     // Remove last dim from newLl (which should be 1)
     SmallVector<int64_t> dstShape(shape.begin(), shape.end());
     dstShape.pop_back();
```
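Net effect in this file: `version` replaces the `versionMajor`/`versionMinor` pair in the parser, printer, verifier, and the GFX950 query, and `inferDefaultJoinOpEncoding` gains a fast path: when the source encoding is a `SliceEncodingAttr` sliced along what becomes the join axis and its parent layout is split-compatible, the join simply restores the parent encoding instead of building a new linear layout.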

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -1529,7 +1529,7 @@ chooseMfmaLikeStoreLayout(RankedTensorType valType) {
 
   Type elemType = valType.getElementType();
   if (!(valType.getRank() == 2 && (elemType.isF16() || elemType.isBF16()) &&
-        mfmaLayout.getVersionMajor() == 4 && mfmaLayout.getIsTransposed() &&
+        mfmaLayout.getVersion() == 4 && mfmaLayout.getIsTransposed() &&
        (isMfma32 || validForMfma16)))
    return {};
 
```

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 8 additions & 2 deletions
```diff
@@ -723,8 +723,14 @@ LogicalResult MemDescSubviewOp::verify() {
   auto ctx = getContext();
   // The order gives us the honest-to-goodness layout rank
   auto srcAllocShape = srcTy.getAllocShape().take_back(getOrder(srcTy).size());
-  auto llInv =
-      triton::gpu::toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
+  auto ll = triton::gpu::toLinearLayout(srcAllocShape, srcTy.getEncoding());
+  // NYI: We don't support non-trivial block dimension for now.
+  auto kBlock = mlir::StringAttr::get(getContext(), "block");
+  if (ll.getInDimSize(kBlock) != 1) {
+    return emitError("non-trivial block dimension not supported");
+  }
+
+  auto llInv = ll.invert();
   auto kDim = mlir::StringAttr::get(ctx, "dim" + llvm::Twine(dim));
   llvm::SmallVector<std::pair<mlir::StringAttr, int32_t>> namedOffsets;
   for (auto d : standardOutDimNames(ctx, srcTy.getRank())) {
```
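This verifier check pairs with the `assert(ll.getInDimSize(str_attr("block")) == 1)` added to the subview lowering in `lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp` above: the verifier rejects multi-block memory descriptors with a proper diagnostic, so the lowering's assumption is already established before code generation runs.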

lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp

Lines changed: 6 additions & 5 deletions
```diff
@@ -126,8 +126,8 @@ tt::CoarseSchedule::splitClusterBefore(Operation *op, scf::ForOp forOp) {
 bool tt::CoarseSchedule::isOpBefore(Operation *a, Operation *b) const {
   assert(opToStageAndCluster.count(a) && opToStageAndCluster.count(b) &&
          "Operations must be in the schedule");
-  auto [aStage, aCluster] = opToStageAndCluster.at(a);
-  auto [bStage, bCluster] = opToStageAndCluster.at(b);
+  auto [aStage, aCluster] = opToStageAndCluster.lookup(a);
+  auto [bStage, bCluster] = opToStageAndCluster.lookup(b);
   if (aStage != bStage) {
     return aStage < bStage;
   }
@@ -141,14 +141,15 @@ bool tt::CoarseSchedule::isOpInEarlierCluster(Operation *a,
                                               Operation *b) const {
   assert(opToStageAndCluster.count(a) && opToStageAndCluster.count(b) &&
          "Operations must be in the schedule");
-  return clusters.isBefore(opToStageAndCluster.at(a).second,
-                           opToStageAndCluster.at(b).second);
+  return clusters.isBefore(opToStageAndCluster.lookup(a).second,
+                           opToStageAndCluster.lookup(b).second);
 }
 
 bool tt::CoarseSchedule::isOpInSameCluster(Operation *a, Operation *b) const {
   assert(opToStageAndCluster.count(a) && opToStageAndCluster.count(b) &&
          "Operations must be in the schedule");
-  return opToStageAndCluster.at(a).second == opToStageAndCluster.at(b).second;
+  return opToStageAndCluster.lookup(a).second ==
+         opToStageAndCluster.lookup(b).second;
 }
 
 SmallVector<std::tuple<Operation *, int, tt::CoarseSchedule::Cluster>>
```
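The move from `.at(...)` to `.lookup(...)` follows from the `Schedule.h` change above: `llvm::MapVector`, unlike `DenseMap`, does not provide `at()`, and its `lookup()` returns the mapped value by copy (default-constructed when the key is absent). The preceding `count()` asserts make the absent case unreachable here, so behavior is unchanged.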
