
Commit 48b23bb

Merge OpenAI Triton commit 34a2120 (#4617)
This PR changes the Triton base from 36b3473 to 34a2120 (Jun 27). Pass rate: 97.14%. Please do not squash and merge this PR.
2 parents 51a925c + 2fae2c5 commit 48b23bb

File tree: 38 files changed, +1108 −218 lines changed


README.md

Lines changed: 6 additions & 4 deletions
@@ -119,15 +119,17 @@ Alternatively, follow these steps to build LLVM from source manually.
   Without this, every invocation of `pip install` uses a different symlink to
   cmake, and this forces ninja to rebuild most of the `.a` files.
 
-- vscode intellisense has some difficulty figuring out how to build Triton's C++
-  (probably because, in our build, users don't invoke cmake directly, but
-  instead use setup.py). Teach vscode how to compile Triton as follows.
+- The build system creates a `compile_commands.json` file under the Triton repo
+  directory. This file is used by VSCode IntelliSense and clangd to provide
+  code completion and other features for C++ code.
+
+  If IntelliSense does not work, you can try the following steps:
 
   - Do a local build. Run command `pip install -e .`
   - Get the full path to the `compile_commands.json` file produced by the build:
    `find ./build -name 'compile_commands.json' | xargs readlink -f`.
    You might get a full path similar to `/Users/{username}/triton/build/cmake.macosx-11.1-arm64-cpython-3.12/compile_commands.json`
-  - In vscode, install the
+  - In VSCode, install the
    [C/C++
    extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools),
    then open the command palette (`Shift + Command + P` on Mac, or `Shift +

include/triton/Analysis/Utility.h

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ class ReduceOpHelper {
 
   bool isReduceWithinCTA();
 
+  bool isAssociative();
+
 private:
   triton::ReduceOp op;
   ArrayRef<int64_t> srcShape;

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 11 additions & 19 deletions
@@ -286,25 +286,10 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
   }
 
   // ---- begin WMMA ----
-  if (mlir::isa<AMDWmmaEncodingAttr>(dotOpEnc.getParent())) {
-    if (dotOpEnc.getOpIdx() == 0) {
-      const int numBanks = 32;
-      const int bankBitWidth = 32;
-
-      // number of inner dimension rows per one pattern repeat
-      int innerDimLength = shape[order[0]];
-      int elemsPerOneBanksRow = (numBanks * bankBitWidth) / typeWidthInBit;
-
-      int perPhase = std::max(1, elemsPerOneBanksRow / innerDimLength);
-      int vecSize = ((typeWidthInBit == 16) ? 64 : 32 ) / typeWidthInBit;
-      int maxPhase = 16 / perPhase;
-
-      return get(context, vecSize, perPhase, maxPhase, order, CTALayout);
-    } else {
-      // Do not swizzle in case k dimension is not innermost.
-      // In this case accesses will go in different banks even without swizzling.
-      return get(context, 1, 1, 1, order, CTALayout);
-    }
+  if (auto wmmaEnc = mlir::dyn_cast<AMDWmmaEncodingAttr>(dotOpEnc.getParent())) {
+    return wmmaEnc.composeSharedLayoutForOperand(
+        CTALayout, dotOpEnc.getOpIdx(), shape, order, dotOpEnc.getKWidth(),
+        typeWidthInBit, needTrans);
   }
 
 
@@ -1230,6 +1215,13 @@ Row |
        Type elemType, int kWidth, int kDim, int opIdx) const;
    SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
    static SmallVector<unsigned> getMNKDimPerInstr();
+
+    // Returns a swizzled shared layout matching this WMMA layout for the
+    // dot operand at the given |operandIdx| with |operandShape|.
+    SwizzledSharedEncodingAttr composeSharedLayoutForOperand(
+        CTALayoutAttr ctaLayout, int operandIdx, ArrayRef<int64_t> operandShape,
+        ArrayRef<unsigned> sharedOrder, unsigned kWidth,
+        unsigned elemBitWidth, bool needTrans) const;
  }];
 }
 

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 46 additions & 0 deletions
@@ -523,6 +523,52 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [
   }];
 }
 
+def TTNG_TCGen5CommitOp : TTNG_Op<"tc_gen5_commit"> {
+  let summary = "make an mbarrier track completion of all prior async tcgen5 ops";
+
+  let description = [{
+    The `ttng.tc_gen5_commit` is an asynchronous operation that makes the
+    mbarrier object track the completion of all prior asynchronous tcgen5
+    operations. Upon completion of all asynchronous operations, the mbarrier
+    arrive operation is performed on the mbarrier with a count of 1.
+
+    If `two_ctas` is set, then the mbarrier tracks all prior operations
+    initiated with `two_ctas` set as well. Otherwise, it tracks all prior
+    operations initiated without `two_ctas`.
+
+    Note that the completion mechanisms are guaranteed to occur sequentially in
+    the order the commit operations were issued. This means, for example:
+
+    ```mlir
+    ttng.tmem_copy
+    ttng.tc_gen5_mma
+    ttng.tc_gen5_commit %barrierA
+    ttng.tc_gen5_commit %barrierB
+    ```
+
+    `%barrierA` tracks the completion of the previous TMEM copy and MMA
+    operations, but since the commit groups are sequential, the arrive-on
+    operation on `%barrierA` is guaranteed to be performed before the arrive-on
+    operation on `%barrierB`, even though its commit group is empty.
+  }];
+
+  let arguments = (ins
+    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$barrier,
+    Optional<I1>:$pred,
+    UnitAttr:$two_ctas
+  );
+
+  let assemblyFormat = [{
+    $barrier (`,` $pred^)? attr-dict `:` qualified(type($barrier))
+  }];
+
+  let builders = [
+    OpBuilder<(ins "Value":$barrier, CArg<"bool", "false">:$two_ctas), [{
+      build($_builder, $_state, barrier, /*pred=*/Value(), two_ctas);
+    }]>,
+  ];
+}
+
 def TTNG_TMEMLoadOp : TTNG_Op<"tmem_load"> {
   let summary = "Load a buffer from tensor memory into a distributed tensor";
 
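To illustrate the builder declared above, here is a minimal, hypothetical C++ sketch (not part of this commit) of emitting the op from lowering code. The `emitCommit` helper name, the include path, and the `ttng` namespace alias are assumptions borrowed from conventions elsewhere in the tree (for example, `python/src/gluon_ir.cc` below uses `ttng::TCGen5CommitOp`); the barrier value is expected to be an mbarrier memdesc in shared memory.

```c++
// Hypothetical usage sketch of the TCGen5CommitOp builder declared above.
#include "mlir/IR/Builders.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

namespace ttng = mlir::triton::nvidia_gpu;

// Arrives on `barrier` (an mbarrier memdesc in shared memory) once all
// previously issued async tcgen5 operations have completed.
static void emitCommit(mlir::OpBuilder &b, mlir::Location loc,
                       mlir::Value barrier, bool twoCTAs = false) {
  // Uses the builder added in this commit: pred is left unset, two_ctas is
  // forwarded as a unit attribute.
  b.create<ttng::TCGen5CommitOp>(loc, barrier, twoCTAs);
}
```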

lib/Analysis/Utility.cpp

Lines changed: 20 additions & 0 deletions
@@ -142,6 +142,26 @@ bool ReduceOpHelper::isReduceWithinCTA() {
   return getCTASplitNum(srcEncoding)[axis] == 1;
 }
 
+bool ReduceOpHelper::isAssociative() {
+  auto dtype = srcElementTypes[0];
+  if (!type::isFloat(dtype))
+    return true;
+  size_t reduce_size = srcShape[axis];
+  if (reduce_size <= 2)
+    return true;
+  bool hasNoAssociativeOp = false;
+  op.walk([&](Operation *nestedOp) -> WalkResult {
+    if (isa<arith::AddFOp, arith::MulFOp>(nestedOp)) {
+      // Only when the element type is floating point, the reduce size is
+      // greater than 2, and the combine region contains an addf or mulf op
+      // do we treat the reduction as non-associative.
+      hasNoAssociativeOp = true;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return !hasNoAssociativeOp;
+}
+
 unsigned ScanLoweringHelper::getAxisNumElementsPerThread() {
   return getEncoding().getContigPerThread()[getAxis()];
 }
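The reason this check only fires for floating-point element types is that `addf` and `mulf` are not associative in floating point, so reassociating a reduction can change its result. A small standalone illustration (not part of the patch):

```c++
// Standalone illustration of why addf/mulf reductions are treated as
// non-associative: reassociating float addition can change the result.
#include <cstdio>

int main() {
  float a = 1e8f, b = -1e8f, c = 1.0f;
  // (a + b) + c == 1.0f, but a + (b + c) == 0.0f because c is absorbed
  // when added to b at this magnitude (the ulp of 1e8f is 8).
  std::printf("%g vs %g\n", (a + b) + c, a + (b + c));
  return 0;
}
```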

lib/Dialect/Triton/Transforms/LoopAwareCSE.cpp

Lines changed: 3 additions & 0 deletions
@@ -93,6 +93,9 @@ bool LoopCSEDriver::areEqualInLoop(Value a, Value b) {
 
   Operation *aDef = a.getDefiningOp();
   Operation *bDef = b.getDefiningOp();
+  if (cast<OpResult>(a).getResultNumber() !=
+      cast<OpResult>(b).getResultNumber())
+    return false;
   // For it to be known that the operation results have the same value, they
   // must be side effect free.
   if (!isMemoryEffectFree(aDef) || !isMemoryEffectFree(bDef))

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 48 additions & 31 deletions
@@ -427,15 +427,6 @@ getDefaultBlockedEncoding(MLIRContext *context, ArrayRef<int64_t> shape,
   return encoding;
 }
 
-bool isSplitCompatible(MLIRContext *ctx, const LinearLayout &ll) {
-  auto lastDim = ll.getNumOutDims() - 1;
-  auto kReg = StringAttr::get(ctx, "register");
-  auto kLastDim = StringAttr::get(ctx, "dim" + std::to_string(lastDim));
-  auto sublayout =
-      ll.sublayout({kReg}, {kLastDim}).removeZeroBasesAlongDim(kReg);
-  return sublayout == LinearLayout::identity1D(2, kReg, kLastDim);
-}
-
 LogicalResult tryJoinOnAxis(MLIRContext *ctx, const LinearLayout &inLl,
                             LinearLayout &outLl, bool fwdInference, int axis,
                             std::optional<Location> loc) {
@@ -2056,6 +2047,42 @@ SmallVector<unsigned> AMDWmmaEncodingAttr::getMNKDimPerInstr() {
   return {16, 16, 16};
 }
 
+SwizzledSharedEncodingAttr AMDWmmaEncodingAttr::composeSharedLayoutForOperand(
+    CTALayoutAttr ctaLayout, int operandIdx, ArrayRef<int64_t> operandShape,
+    ArrayRef<unsigned> sharedOrder, unsigned kWidth, unsigned elemBitWidth,
+    bool needTrans) const {
+  int kDimIndex = operandIdx == 0 ? 1 : 0;
+  bool isKContig = sharedOrder[0] == kDimIndex;
+
+  if (!isKContig) {
+    // Do not swizzle. In this case accesses will go in different banks even
+    // without swizzling.
+    return SwizzledSharedEncodingAttr::get(getContext(), 1, 1, 1, sharedOrder,
+                                           ctaLayout);
+  }
+
+  // max vectorization size for ds_load is 128 bits
+  int vectorSize = std::min(kWidth * elemBitWidth, 128u) / elemBitWidth;
+
+  const int numBanks = 32;
+  const int bankBitWidth = 32;
+
+  // Number of inner dimension rows per one pattern repeat
+  int innerDimLength = operandShape[sharedOrder[0]];
+  int elemsPerOneBanksRow = (numBanks * bankBitWidth) / elemBitWidth;
+
+  int perPhase = std::max(1, elemsPerOneBanksRow / innerDimLength);
+  // for both RDNA3 and RDNA4, the M/N dimension of wmma is 16
+  // This represents the max number of rows that can be accessed
+  // at the same time
+  int mDim = getMNKDimPerInstr()[0];
+  int maxPhase =
+      std::max(std::min(mDim / perPhase, innerDimLength / vectorSize), 1);
+
+  return SwizzledSharedEncodingAttr::get(getContext(), vectorSize, perPhase,
+                                         maxPhase, sharedOrder, ctaLayout);
+}
+
 //===----------------------------------------------------------------------===//
 // Mma encoding
 //===----------------------------------------------------------------------===//
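As a sanity check on the arithmetic in the hunk above, the following standalone sketch reproduces the swizzle-parameter computation for one illustrative configuration (a 16-bit element type, kWidth = 8, and a K dimension of length 64). These inputs are examples chosen here and are not taken from the patch or its tests.

```c++
// Standalone sketch of the swizzle-parameter arithmetic used by
// composeSharedLayoutForOperand above, with illustrative inputs.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned kWidth = 8, elemBitWidth = 16; // e.g. fp16 operand, 8 elems/thread along K
  int innerDimLength = 64;                // K is the contiguous (inner) dimension

  // ds_load can vectorize at most 128 bits.
  int vectorSize = std::min(kWidth * elemBitWidth, 128u) / elemBitWidth;

  const int numBanks = 32;
  const int bankBitWidth = 32;
  int elemsPerOneBanksRow = (numBanks * bankBitWidth) / elemBitWidth;

  int perPhase = std::max(1, elemsPerOneBanksRow / innerDimLength);
  int mDim = 16; // WMMA M/N tile size on RDNA3/RDNA4
  int maxPhase =
      std::max(std::min(mDim / perPhase, innerDimLength / vectorSize), 1);

  // Prints: vec=8 perPhase=1 maxPhase=8 for these inputs.
  std::printf("vec=%d perPhase=%d maxPhase=%d\n", vectorSize, perPhase, maxPhase);
  return 0;
}
```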
@@ -2659,7 +2686,9 @@ struct TritonGPUInferLayoutInterface
     auto parent = enc.getParent();
     auto parentLL = toLinearLayout(joinedShape, parent);
 
-    if (isSplitCompatible(ctx, parentLL)) {
+    Attribute splitEnc;
+    auto result = inferSplitOpEncoding(parent, splitEnc, joinedShape, loc);
+    if (succeeded(result) && areLayoutsEquivalent(shape, splitEnc, srcEnc)) {
       dstEnc = parent;
       return success();
     }
@@ -2709,28 +2738,16 @@ struct TritonGPUInferLayoutInterface
   inferSplitOpEncoding(Attribute srcEnc, Attribute &dstEnc,
                        ArrayRef<int64_t> shape,
                        std::optional<Location> loc) const override {
+    // SplitOp takes a tensor of shape AxBxCx2 and generates two tensors of
+    // shape AxBxC. The input must have 2 elements per thread in the last
+    // dimension, which must be the fastest running dimension. The result
+    // encoding is the same as the input, but with the last dimension removed.
     auto enc = mlir::dyn_cast<BlockedEncodingAttr>(srcEnc);
-    if (enc) {
-      // SplitOp takes a tensor of shape AxBxCx2 and generates two tensors of
-      // shape AxBxC. The input must have 2 elements per thread in the last
-      // dimension, which must be the fastest running dimension. The result
-      // encoding is the same as the input, but with the last dimension removed.
-      if (enc.getSizePerThread().back() != 2) {
-        return emitOptionalError(
-            loc, "SplitOp requires 2 elements per thread in the "
-                 "last dimension of the input");
-      }
-      if (enc.getThreadsPerWarp().back() != 1 ||
-          enc.getWarpsPerCTA().back() != 1 || enc.getCTAsPerCGA().back() != 1) {
-        return emitOptionalError(
-            loc, "SplitOp requires threadsPerWarp, warpsPerCTA, "
-                 "and CTAsPerCGA = 1 for the last dimension of the input");
-      }
-      if (enc.getCTALayout().getCTAsPerCGA().back() != 1) {
-        return emitOptionalError(
-            loc,
-            "SplitOp requires the last dimension to be most-minor in CTAOrder");
-      }
+    bool isSimpleSplit = (enc && (enc.getSizePerThread().back() == 2) &&
+                          (enc.getThreadsPerWarp().back() == 1) &&
+                          (enc.getWarpsPerCTA().back() == 1) &&
+                          (enc.getCTAsPerCGA().back() == 1));
+    if (isSimpleSplit) {
       SmallVector<unsigned> newOrder(enc.getOrder());
       int splitDim = newOrder.size() - 1;
       // Remove splitDim from order.

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 7 additions & 0 deletions
@@ -1229,6 +1229,13 @@ void LayoutRematerialization::backwardRematerialization(
       // Reduce op introduce much cost.
       auto reduceOp = dyn_cast<ReduceOp>(op);
       ReduceOpHelper helper(reduceOp);
+      if (!helper.isAssociative()) {
+        // We shouldn't rematerialize a non-associative reduce op if it has
+        // multiple use chains.
+        LDBG(" skipped rematerialization due to non-associative reduce in the "
+             "slice");
+        return;
+      }
       rematerialisationCost += helper.getIntraWarpSizeWithUniqueData();
       rematerialisationCost += 8 * helper.getInterWarpSizeWithUniqueData();
     }

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 4 additions & 0 deletions
@@ -1573,6 +1573,10 @@ bool comesFromLoadOrBlockArg(Value v) {
       v = cvtOp.getSrc();
       continue;
     }
+    if (auto transOp = dyn_cast<tt::TransOp>(def)) {
+      v = transOp.getSrc();
+      continue;
+    }
     if (def->hasTrait<OpTrait::MemDescViewTrait>()) {
       v = def->getOperand(0);
       continue;

python/src/gluon_ir.cc

Lines changed: 4 additions & 0 deletions
@@ -448,6 +448,10 @@ void init_gluon_ir(py::module &&m) {
                                              pred, two_ctas, mbarriers,
                                              mbarrier_preds);
       })
+      .def("create_tcgen05_commit",
+           [](GluonOpBuilder &self, Value &barrier) {
+             self.create<ttng::TCGen5CommitOp>(barrier);
+           })
 
       .def("create_async_tma_copy_global_to_local",
            [](GluonOpBuilder &self, Value descPtr, std::vector<Value> &coord,

0 commit comments
