
Commit 8518cb3

Merge OpenAI Triton commit 449e014 (#5569)
This PR changes the Triton base from 4327b5b to 449e014 (Nov 13). Pass rate: 95.19%
2 parents: 9af0685 + 2caaac0

File tree: 122 files changed, +3295 −1424 lines


include/triton/Analysis/AxisInfo.h

Lines changed: 8 additions & 10 deletions

@@ -264,16 +264,14 @@ class ModuleAxisInfoAnalysis : public CallGraph<AxisInfoMapT> {
                          axisinfo::CallbackType callback = nullptr)
       : CallGraph<AxisInfoMapT>(moduleOp) {
     SmallVector<FunctionOpInterface> funcs;
-    for (auto root : getRoots()) {
-      walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
-          // Pre-order edge walk callback
-          [](CallOpInterface callOp, FunctionOpInterface funcOp) {},
-          // Post-order node walk callback
-          [&](FunctionOpInterface funcOp) {
-            funcs.push_back(funcOp);
-            funcMap.try_emplace(funcOp, AxisInfoMapT{});
-          });
-    }
+    walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
+        // Pre-order edge walk callback
+        [](CallOpInterface callOp, FunctionOpInterface funcOp) {},
+        // Post-order node walk callback
+        [&](FunctionOpInterface funcOp) {
+          funcs.push_back(funcOp);
+          funcMap.try_emplace(funcOp, AxisInfoMapT{});
+        });
     SetVector<FunctionOpInterface> sortedFuncs(funcs.begin(), funcs.end());
     SymbolTableCollection symbolTable;
     for (auto funcOp : llvm::reverse(sortedFuncs)) {
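
The constructor now issues a single call-graph walk instead of one walk per root. A minimal standalone sketch (plain C++, not the Triton CallGraph API) of why one post-order walk over all nodes suffices: post-order visits callees before callers, so iterating the collected list in reverse, as the constructor does with llvm::reverse(sortedFuncs), processes callers first.

#include <cstdio>
#include <functional>
#include <map>
#include <set>
#include <string>
#include <vector>

// Illustrative stand-in for a call graph: function name -> callees.
using Graph = std::map<std::string, std::vector<std::string>>;

// Collect functions in post-order (callees before callers), visiting each
// node at most once; starting from every node also covers every root.
std::vector<std::string> postOrderFuncs(const Graph &g) {
  std::vector<std::string> order;
  std::set<std::string> seen;
  std::function<void(const std::string &)> visit = [&](const std::string &f) {
    if (!seen.insert(f).second)
      return;
    if (auto it = g.find(f); it != g.end())
      for (const auto &callee : it->second)
        visit(callee);
    order.push_back(f); // post-order node callback
  };
  for (const auto &[f, callees] : g)
    visit(f);
  return order;
}

int main() {
  Graph g{{"kernel", {"helper"}}, {"helper", {"leaf"}}, {"leaf", {}}};
  auto order = postOrderFuncs(g);
  // Reverse post-order processes callers before their callees.
  for (auto it = order.rbegin(); it != order.rend(); ++it)
    std::printf("%s\n", it->c_str()); // kernel, helper, leaf
  return 0;
}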

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 4 additions & 0 deletions

@@ -281,6 +281,10 @@ bool comesFromLoadOrBlockArg(Value v);
 // `resultIdx`th result.
 SmallVector<Value> getTiedArgs(Operation *op, int resultIdx);

+// Verifies the provided memory descriptor type used for barrier allocation
+LogicalResult verifyBarrierType(Operation *op,
+                                mlir::triton::gpu::MemDescType barrierType);
+
 } // namespace mlir::triton

 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

include/triton/Dialect/TritonInstrument/IR/Utility.h

Lines changed: 5 additions & 2 deletions

@@ -15,6 +15,10 @@ constexpr int TC_THREAD_OFFSET = TMA_THREAD_OFFSET + NUM_THREADS;
 constexpr int TOTAL_NUM_THREADS = TC_THREAD_OFFSET + NUM_THREADS;
 constexpr int THREADS_BITMASK_SIZE = llvm::NextPowerOf2(TOTAL_NUM_THREADS);

+namespace CommitKind {
+enum Kind { None = -1, AsyncCp = 0, Wgmma, TmaStore, NumCommitKinds };
+}
+
 Operation *createStoreScratchMemory(OpBuilder &b, Location loc, Value alloc,
                                     Value tensor, RankedTensorType tensorType);
 Value createLoadScratchMemory(OpBuilder &b, Location loc, Value alloc,
@@ -63,8 +67,7 @@ struct AuxDataMap {
   RegionToValueMap writeTracking[numMemTypes];
   RegionToValueMap readVisibility[numMemTypes];
   RegionToValueMap readTracking[numMemTypes];
-  RegionToValueMap asyncCpCommits;
-  RegionToValueMap wgmmaCommits;
+  RegionToValueMap commits[CommitKind::NumCommitKinds];
   RegionToValueMap lock;
   RegionToValueMap waiting;
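
The two named commit maps collapse into one array indexed by the new CommitKind enum, which also introduces a TmaStore kind. A minimal standalone sketch (illustrative stand-in types, not the real RegionToValueMap) of the shape of this refactor: adding a commit kind no longer requires a new struct field, and instrumentation code can loop over all kinds.

#include <cstdio>
#include <map>
#include <string>

// Illustrative stand-in for the real RegionToValueMap.
using RegionToValueMap = std::map<int, std::string>;

namespace CommitKind {
enum Kind { None = -1, AsyncCp = 0, Wgmma, TmaStore, NumCommitKinds };
}

struct AuxDataMap {
  // Before: RegionToValueMap asyncCpCommits; RegionToValueMap wgmmaCommits;
  // After: one array indexed by kind, so passes can iterate generically.
  RegionToValueMap commits[CommitKind::NumCommitKinds];
};

int main() {
  AuxDataMap aux;
  aux.commits[CommitKind::AsyncCp][0] = "async-cp commit token";
  for (int k = 0; k < CommitKind::NumCommitKinds; ++k)
    std::printf("kind %d tracks %zu regions\n", k, aux.commits[k].size());
  return 0;
}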

include/triton/Dialect/TritonNvidiaGPU/Transforms/Utility.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

lib/Analysis/AxisInfo.cpp

Lines changed: 3 additions & 13 deletions

@@ -29,10 +29,6 @@ template <typename... Args> int64_t gcd(int64_t a, int64_t b, Args... args) {
   return gcd(std::gcd(a, b), args...);
 }

-constexpr int log2Int(int64_t num) {
-  return (num > 1) ? 1 + log2Int(num / 2) : 0;
-}
-
 // If lhs * rhs overflows, return max value possible value for the type
 int64_t multiplyDivisor(int64_t lhs, int64_t rhs) {
   if (lhs > kMaxDivisor / rhs)
@@ -167,7 +163,6 @@ class AxisInfoAnalysis : public dataflow::SparseForwardDataFlowAnalysis<
                    axisinfo::CallbackType callback = nullptr);
   using dataflow::SparseForwardDataFlowAnalysis<
       dataflow::Lattice<AxisInfo>>::getLatticeElement;
-  using FuncAxisInfoMapT = DenseMap<FunctionOpInterface, AxisInfo>;

   LogicalResult
   visitOperation(Operation *op,
@@ -326,7 +321,6 @@ class AddSubOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
       // with element locations:
       // [4, 5, 6, 7]
       // It is "strided contiguous" with a divisibility of 16 bytes
-      auto rank = lhs.getRank();
       auto elemSize = std::max<int64_t>(
           1, triton::getPointeeBitWidth(op.getPtr().getType()) / 8);
       rhsDivisibility = multiplyDivisor(rhs.getDivisibility(dim), elemSize);
@@ -345,7 +339,6 @@ class AddSubOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
         return {lhs.getConstantValue().value() -
                 rhs.getConstantValue().value()};
     } else if constexpr (std::is_same_v<OpTy, triton::AddPtrOp>) {
-      auto rank = lhs.getRank();
       auto elemSize = std::max<int64_t>(
           1, triton::getPointeeBitWidth(op.getPtr().getType()) / 8);
       auto rhsValue = rhs.getConstantValue().value() * elemSize;
@@ -379,14 +372,12 @@ class MulIOpAxisInfoVisitor final : public BinaryOpVisitorImpl<arith::MulIOp> {
   int64_t getDivisibility(arith::MulIOp op, const AxisInfo &lhs,
                           const AxisInfo &rhs, int dim) override {
     auto lhsDivisibility = lhs.getDivisibility(dim);
-    if (lhs.getContiguity(dim) > 1 &&
-        !(rhs.getConstantValue().has_value() && rhs.getConstantValue() == 1)) {
+    if (lhs.getContiguity(dim) > 1 && rhs.getConstantValue() != 1) {
       // Treat [2^n,2^n+1,...]'s divisibility as 1 instead of 2^n
       lhsDivisibility = 1;
     }
     auto rhsDivisibility = rhs.getDivisibility(dim);
-    if (rhs.getContiguity(dim) > 1 &&
-        !(lhs.getConstantValue().has_value() && lhs.getConstantValue() == 1)) {
+    if (rhs.getContiguity(dim) > 1 && lhs.getConstantValue() != 1) {
       // Treat [2^n,2^n+1,...]'s divisibility as 1 instead of 2^n
       rhsDivisibility = 1;
     }
@@ -685,7 +676,7 @@ class CmpOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
     AxisInfo::DimVectorT contiguity, divisibility, constancy;
     std::optional<int64_t> constantValue;
     for (short d = 0; d < rank; ++d) {
-      int64_t constHint = 1;
+      int64_t constHint;
       if (lhsInfo.getConstantValue().has_value() &&
           rhsInfo.getConstantValue().has_value()) {
         constHint = shape[d];
@@ -907,7 +898,6 @@ class ShLIOpAxisInfoVisitor final : public BinaryOpVisitorImpl<arith::ShLIOp> {
      // Treat [2^n,2^n+1,...]'s divisibility as 1 instead of 2^n
       lhsDivisibility = 1;
     }
-    auto numBits = log2Int(lhsDivisibility);
     return multiplyDivisor(lhsDivisibility, 1ll << shift);
   }
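
The MulIOp simplification leans on std::optional's mixed comparison operators: for an empty optional, opt != 1 is true, so the shortened condition is exactly equivalent to the old spelled-out !(opt.has_value() && opt == 1). Below is a minimal standalone check of that equivalence, plus the gcd fact behind the "treat [2^n,2^n+1,...]'s divisibility as 1" comment.

#include <cassert>
#include <cstdint>
#include <numeric>
#include <optional>

int main() {
  // `opt != 1` on a std::optional is !opt.has_value() || *opt != 1, which
  // matches the old condition in all three cases: empty, one, other.
  std::optional<int64_t> empty, one = 1, two = 2;
  assert((empty != 1) == !(empty.has_value() && *empty == 1));
  assert((one != 1) == !(one.has_value() && *one == 1));
  assert((two != 1) == !(two.has_value() && *two == 1));

  // Why contiguity collapses divisibility to 1: consecutive values such as
  // [8, 9, ...] share no common factor, so after a multiply only the other
  // operand's divisibility can be preserved.
  assert(std::gcd(int64_t{8}, int64_t{9}) == 1);
  return 0;
}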

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 0 additions & 27 deletions

@@ -719,31 +719,6 @@ class ScaledBlockedToMMA : public mlir::OpRewritePattern<triton::DotScaledOp> {
                                          mmaResult.newRetType, rewriter);
     Value newB = convertDotOperandForMMA(b, 1, minBitwidth,
                                          mmaResult.newRetType, rewriter);
-
-    // Compute tiles per warp for each operand
-    auto computeTilePerWarp = [&](Value operand, int operandIdx) -> unsigned {
-      auto operandTy = cast<RankedTensorType>(operand.getType());
-      auto dotEncoding = dyn_cast<triton::gpu::DotOperandEncodingAttr>(
-          operandTy.getEncoding());
-      if (!dotEncoding)
-        return 1;
-
-      const int bitWidth = operandTy.getElementType().getIntOrFloatBitWidth();
-      const int kWidth = dotEncoding.getKWidth();
-      auto rep = mmaResult.mmaEnc.getRepForOperand(
-          triton::gpu::getShapePerCTA(operandTy), bitWidth, kWidth,
-          dotEncoding.getOpIdx());
-
-      // repA = [repM, repK], repB = [repK, repN]
-      // For operand A (idx 0): return rep[1] (repK)
-      // For operand B (idx 1): return rep[2] (repN)
-      if (operandIdx == 0) {
-        return rep.size() >= 2 ? rep[1] : 1;
-      } else {
-        return rep.size() >= 3 ? rep[2] : 1;
-      }
-    };
-
     const auto mmaWarps = mmaResult.mmaEnc.getWarpsPerCTA(); // [wM, wN]
     // Convert scales to Linear layout
     auto convertScale = [&](Value scale, int opIdx) -> Value {
@@ -808,8 +783,6 @@ class ScaledBlockedToMMAv5
     // operands
     Value a = dotOp.getA();
     Value b = dotOp.getB();
-    auto oldAType = a.getType();
-    auto oldBType = b.getType();

     bool IsAMixedPrecFp4 = false;
     bool IsBMixedPrecFp4 = false;

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 9 additions & 0 deletions

@@ -1707,4 +1707,13 @@ SmallVector<Value> getTiedArgs(Operation *op, int resultIdx) {
   return {};
 }

+LogicalResult verifyBarrierType(Operation *op,
+                                mlir::triton::gpu::MemDescType barrierType) {
+  if (!barrierType.getElementType().isInteger(64) ||
+      barrierType.getShape() != ArrayRef<int64_t>({1}))
+    return op->emitOpError(
+        "barrier allocation must be a descriptor of 1xi64 type");
+  return success();
+}
+
 } // namespace mlir::triton
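
A minimal standalone sketch (plain C++, not the MLIR types) of the predicate verifyBarrierType enforces: the allocation must be a memory descriptor holding exactly one i64 element.

#include <cstdio>
#include <vector>

// Illustrative stand-ins for MemDescType's element type and shape.
struct MemDescSketch {
  int elementBitWidth;          // stand-in for getElementType()
  std::vector<long long> shape; // stand-in for getShape()
};

// Mirrors the helper's check: reject anything that is not a 1xi64 descriptor.
bool isValidBarrierType(const MemDescSketch &t) {
  return t.elementBitWidth == 64 && t.shape == std::vector<long long>{1};
}

int main() {
  std::printf("%d\n", isValidBarrierType({64, {1}})); // 1: a 1xi64 descriptor
  std::printf("%d\n", isValidBarrierType({32, {1}})); // 0: wrong element type
  std::printf("%d\n", isValidBarrierType({64, {2}})); // 0: wrong shape
  return 0;
}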

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionScheduling.cpp

Lines changed: 16 additions & 3 deletions

@@ -823,17 +823,30 @@ SetVector<int> assignIfOpPartitions(scf::IfOp ifOp) {
   for (int i = 0; i < thenYieldPartitions.size(); ++i) {
     auto &thenIds = thenYieldPartitions[i];
     auto &elseIds = elseYieldPartitions[i];
+    auto thenYieldOpnd = ifOp.thenYield()->getOperand(i);
+    auto elseYieldOpnd = ifOp.elseYield()->getOperand(i);
+    auto thenYieldOpndDefOp = thenYieldOpnd.getDefiningOp();
+    auto elseYieldOpndDefOp = elseYieldOpnd.getDefiningOp();

-    if (auto yieldOpnd = ifOp.thenYield()->getOperand(i);
-        isa<AsyncTokenType>(yieldOpnd.getType())) {
+    if (isa<AsyncTokenType>(thenYieldOpnd.getType())) {
       // Heuristic: when if-op yields an async-token, the output partition of
       // the token is that of its producer
-      if (ifOp.thenBlock()->findAncestorOpInBlock(*yieldOpnd.getDefiningOp())) {
+      if (ifOp.thenBlock()->findAncestorOpInBlock(
+              *thenYieldOpnd.getDefiningOp())) {
         outputPartitions.push_back(elseIds);
       } else {
         outputPartitions.push_back(thenIds);
       }
+    } else if (thenYieldOpndDefOp &&
+               thenYieldOpndDefOp->getBlock() == ifOp.thenBlock()) {
+      // Heuristic: if yield operand is defined in then block, use its Ids
+      outputPartitions.push_back(thenIds);
+    } else if (elseYieldOpndDefOp &&
+               elseYieldOpndDefOp->getBlock() == ifOp.elseBlock()) {
+      // same for else block
+      outputPartitions.push_back(elseIds);
     } else {
+      // otherwise pick thenIds if available, otherwise elseIds
       outputPartitions.push_back(!thenIds.empty() ? thenIds : elseIds);
     }
   }
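
A minimal standalone sketch (simplified stand-in types, not the MLIR ops) of the extended cascade for choosing a yielded value's output partition. Note that, as in the commit, the async-token case picks the else-branch ids when the token's producer lives in the then-block.

#include <cstdio>
#include <vector>

using Ids = std::vector<int>;

// Illustrative stand-in for the properties of a yield operand.
struct YieldOperand {
  bool isAsyncToken = false;
  bool definedInThenBlock = false;
  bool definedInElseBlock = false;
};

// Mirrors the heuristic cascade in assignIfOpPartitions.
Ids pickOutputPartition(const YieldOperand &thenOpnd,
                        const YieldOperand &elseOpnd, const Ids &thenIds,
                        const Ids &elseIds) {
  if (thenOpnd.isAsyncToken) // token follows its producer, as in the source
    return thenOpnd.definedInThenBlock ? elseIds : thenIds;
  if (thenOpnd.definedInThenBlock) // value produced by the then-branch
    return thenIds;
  if (elseOpnd.definedInElseBlock) // value produced by the else-branch
    return elseIds;
  return !thenIds.empty() ? thenIds : elseIds; // fallback
}

int main() {
  Ids thenIds{0}, elseIds{1};
  YieldOperand thenOpnd{/*isAsyncToken=*/false, /*definedInThenBlock=*/true};
  YieldOperand elseOpnd{};
  // The then-branch defines the value, so its partition ids win: prints 0.
  std::printf("%d\n",
              pickOutputPartition(thenOpnd, elseOpnd, thenIds, elseIds)[0]);
  return 0;
}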
