
Commit 4a0c8fc

Merge commit '557a7094e6fc742360297905297b0b50e0bc8088'
2 parents: b665d5a + 557a709


50 files changed: +1217 −623 lines

.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions
@@ -40,6 +40,12 @@ repos:
   hooks:
   - id: clang-format

+- repo: https://github.com/pre-commit/mirrors-mypy
+  rev: "v1.15.0"
+  hooks:
+  - id: mypy
+    pass_filenames: false
+
 # Expand YAML anchors in files used by github workflows, because github can't
 # do this itself. This lets us use anchors, which avoids code duplication.
 - repo: local

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 43 additions & 0 deletions
@@ -729,6 +729,49 @@ SmallVector<Value> unpackLLVector(Location loc, Value llvmVec,

 Value packLLVector(Location loc, ValueRange vals, RewriterBase &rewriter);

+inline std::optional<LLVM::AtomicBinOp> matchAtomicOp(RMWOp atomicOp) {
+  switch (atomicOp) {
+  case RMWOp::AND:
+    return LLVM::AtomicBinOp::_and;
+  case RMWOp::OR:
+    return LLVM::AtomicBinOp::_or;
+  case RMWOp::XOR:
+    return LLVM::AtomicBinOp::_xor;
+  case RMWOp::ADD:
+    return LLVM::AtomicBinOp::add;
+  case RMWOp::FADD:
+    return LLVM::AtomicBinOp::fadd;
+  case RMWOp::MAX:
+    return LLVM::AtomicBinOp::max;
+  case RMWOp::MIN:
+    return LLVM::AtomicBinOp::min;
+  case RMWOp::UMAX:
+    return LLVM::AtomicBinOp::umax;
+  case RMWOp::UMIN:
+    return LLVM::AtomicBinOp::umin;
+  case RMWOp::XCHG:
+    return LLVM::AtomicBinOp::xchg;
+  default:
+    return {};
+  }
+}
+
+inline std::optional<LLVM::AtomicOrdering>
+getMemoryOrdering(MemSemantic memOrdering) {
+  switch (memOrdering) {
+  case MemSemantic::RELAXED:
+    return LLVM::AtomicOrdering::monotonic;
+  case MemSemantic::ACQUIRE:
+    return LLVM::AtomicOrdering::acquire;
+  case MemSemantic::RELEASE:
+    return LLVM::AtomicOrdering::release;
+  case MemSemantic::ACQUIRE_RELEASE:
+    return LLVM::AtomicOrdering::acq_rel;
+  default:
+    return {};
+  }
+}
+
 inline bool
 isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
                            ArrayRef<int64_t> allocShape,
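These helpers centralize the mapping from Triton's RMWOp and MemSemantic enums to their LLVM-dialect counterparts, returning an empty optional for unsupported values. Below is a minimal sketch of how a backend lowering might consume them; the op accessors (getAtomicRmwOp, getSem) and the rewriter plumbing are illustrative assumptions, not code from this commit.

// Sketch only: assumes `op` exposes its RMW kind and memory semantic, and that
// the pointer/value operands have already been converted to LLVM types.
LogicalResult lowerAtomicRMW(triton::AtomicRMWOp op, Value llPtr, Value llVal,
                             RewriterBase &rewriter) {
  std::optional<LLVM::AtomicBinOp> binOp = matchAtomicOp(op.getAtomicRmwOp());
  std::optional<LLVM::AtomicOrdering> ordering = getMemoryOrdering(op.getSem());
  if (!binOp || !ordering)
    return failure(); // combination not representable in the LLVM dialect
  rewriter.replaceOpWithNewOp<LLVM::AtomicRMWOp>(op, *binOp, llPtr, llVal,
                                                 *ordering);
  return success();
}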

include/triton/Dialect/Triton/IR/TritonDialect.td

Lines changed: 2 additions & 1 deletion
@@ -45,7 +45,8 @@ def Triton_Dialect : Dialect {

  let discardableAttrs = (ins
    "::mlir::IntegerAttr":$num_stages,
-   "::mlir::IntegerAttr":$latency
+   "::mlir::IntegerAttr":$latency,
+   "::mlir::IntegerAttr":$self_latency
  );

  let hasConstantMaterializer = 1;
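self_latency joins the existing num_stages and latency discardable attributes on the Triton dialect. A hedged sketch of reading it back off an operation follows; the "tt.self_latency" spelling is an assumption for illustration and is not shown anywhere in this diff.

// Assumption: the serialized attribute is named "tt.self_latency"; the exact
// spelling is an implementation detail not visible in this commit. Returns 0
// when the attribute is absent.
static int getSelfLatency(mlir::Operation *op) {
  if (auto attr = op->getAttrOfType<mlir::IntegerAttr>("tt.self_latency"))
    return attr.getInt();
  return 0;
}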

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 31 additions & 11 deletions
@@ -20,11 +20,37 @@ namespace triton::nvidia_gpu {
 // Given an MMAv5 operation in a loop, determine if its accumulator can be
 // multibuffered.
 bool isAccMultibufferingPossible(MMAv5OpInterface mma, scf::ForOp forOp);
-// Only pipeline the loops where the MMA happens before the tmem_load, or is in
-// the same stage as the tmem_load. Lowering does not support the case where the
-// MMA is in a different stage as the tmem_load and happens after it.
-bool mmav5DominatesTmemLoads(
-    scf::ForOp forOp, function_ref<bool(MMAv5OpInterface)> isMmaPipelineable);
+
+// Returns true if the MMA operation requires acc multi-buffering when
+// pipelined.
+bool requiresAccMultiBuffering(MMAv5OpInterface mma, scf::ForOp forOp);
+
+// Returns true if there are loads from tmem after the MMA operation.
+bool hasLoadsAfterMMA(MMAv5OpInterface mma, scf::ForOp forOp);
+
+// Helper class to determine if the operands of an MMA operation are
+// pipelineable.
+class MMAv5PipelineableOperandsHelper {
+public:
+  MMAv5PipelineableOperandsHelper(
+      MMAv5OpInterface mmaOp, scf::ForOp forOp,
+      std::function<bool(Operation *)> isLoadToBePipelined)
+      : mmaOp(mmaOp), forOp(forOp), isLoadToBePipelined(isLoadToBePipelined) {
+    run();
+  }
+  bool isPipelineable = false;
+  // If true, all existing operand loads have been found and their
+  // pipelineability has been determined.
+  bool isOperandsStateDetermined = false;
+  SmallVector<Operation *> unpipelineableOperandLoads;
+
+private:
+  MMAv5OpInterface mmaOp;
+  scf::ForOp forOp;
+  std::function<bool(Operation *)> isLoadToBePipelined;
+  bool comesFromLoadOrOutsideLoop(Value v, Operation *&foundLoad);
+  void run();
+};

 //===----------------------------------------------------------------------===//
 // MMA Pipeline Rewriters
@@ -35,12 +61,6 @@ bool mmav5DominatesTmemLoads(
 TMEMAllocOp createTMemAlloc(OpBuilder &builder, TMEMAllocOp oldTMemAllocOp,
                             bool multiBufferred, int numStages);

-// Return true if operands of the MMA operation are/are going to be pipelined
-// and multibuffered, enabling the MMA operation to be pipelined.
-bool mmaHasPipelineableOperands(
-    MMAv5OpInterface mma, scf::ForOp forOp,
-    std::function<bool(Operation *)> isLoadPipelineable);
-
 // Return true if the accumulator of an mma in subsequent iterations is either
 // independent from the previous iteration (overwritten) or completely reused,
 // without read-modify-write.
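MMAv5PipelineableOperandsHelper replaces the deleted mmaHasPipelineableOperands free function and reports more than a single bool: whether the operands are pipelineable, whether every operand load could even be identified, and which loads block pipelining. A usage sketch under the same assumptions as AssignLatencies.cpp below (ttng abbreviates triton::nvidia_gpu; `opLatency` stands in for the caller's latency map):

// Sketch: classify the operands of one MMA inside a loop.
static void classifyMmaOperands(ttng::MMAv5OpInterface mma, scf::ForOp forOp,
                                DenseMap<Operation *, int> &opLatency) {
  auto isLoadToBePipelined = [&](Operation *op) {
    return opLatency.count(op) && opLatency[op] > 0;
  };
  ttng::MMAv5PipelineableOperandsHelper helper(mma, forOp, isLoadToBePipelined);
  if (helper.isPipelineable) {
    // All operand loads are (or will be) pipelined; the MMA can be pipelined.
  } else if (helper.isOperandsStateDetermined) {
    // Every operand load was found, but some cannot be pipelined; they are
    // listed in helper.unpipelineableOperandLoads.
  } else {
    // Some operand could not be traced to a load or a loop-invariant value,
    // so nothing can be concluded about it.
  }
}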

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 8 additions & 0 deletions
@@ -91,6 +91,11 @@ int getCopyVecBytes(RankedTensorType registerTy,
 // attribute.
 void serializeLatencies(ModuleOp module, DenseMap<Operation *, int> &opLatency);

+// Serialize the self latencies of the operations in the loops into the
+// self_latency attribute.
+void serializeSelfLatencies(ModuleOp module,
+                            DenseMap<Operation *, int> &opSelfLatency);
+
 // Deserialize the latencies of the operations in the loops from the attribute.
 DenseMap<Operation *, int> deserializeLatencies(Operation *op);

@@ -107,6 +112,9 @@ Value createAlloc(scf::ForOp forOp, RankedTensorType ty, Location loc,
 // Determine if the operation is a TMA load.
 bool isTMALoad(Operation *op);

+// Determine if the operation can be lowered to an async load.
+bool canBeAsyncLoad(Operation *op);
+
 // Look for consecutive wait ops and combine them into a single wait op.
 void combineRedundantWaitOps(
     llvm::SmallSetVector<gpu::AsyncWaitOp, 8> &waitOps);
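serializeSelfLatencies mirrors serializeLatencies but writes the new self_latency attribute, and canBeAsyncLoad generalizes the existing isTMALoad query. A hedged sketch of the intended call pattern, matching how AssignLatencies.cpp uses it further down; names and namespace qualification are abbreviated:

// Sketch: record that an MMA may overlap with itself (self latency) and that
// its users may move one stage later (regular latency), then serialize both
// maps onto the module.
DenseMap<Operation *, int> opLatency, opSelfLatency;
opSelfLatency[mma] = 1; // the MMA can be overlapped with itself
opLatency[mma] = 1;     // the MMA's users can be pushed to the next stage
ModuleOp module = forOp->getParentOfType<ModuleOp>();
serializeLatencies(module, opLatency);
serializeSelfLatencies(module, opSelfLatency);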

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 14 additions & 0 deletions
@@ -125,6 +125,20 @@ class CoarseSchedule {

   auto find(Operation *op) const { return opToStageAndCluster.find(op); }

+  // Split the cluster containing op into two clusters, one containing all
+  // operations before the op and one containing op and all operations after
+  // the op. Return the cluster containing op and all operations after the op.
+  Cluster splitClusterBefore(Operation *op, scf::ForOp forOp);
+
+  // Check if op a will show up before op b in the final unrolled code.
+  bool isOpBefore(Operation *a, Operation *b);
+
+  // Check if op a is in an earlier cluster than op b.
+  bool isOpInEarlierCluster(Operation *a, Operation *b);
+
+  // Check if op a is in the same cluster as op b.
+  bool isOpInSameCluster(Operation *a, Operation *b);
+
   SmallVector<std::tuple<Operation *, int, Cluster>>
   getOpsInOrder(scf::ForOp forOp);
   std::vector<std::pair<Operation *, unsigned>>
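The new CoarseSchedule queries let pipeliner passes reason about the final emission order of two ops without re-deriving stages and clusters by hand. A small sketch, assuming both ops are already registered in the schedule (namespace qualification omitted):

// Sketch: if `a` would not be emitted before `b`, start a new cluster at `b`
// so that later code can be scheduled immediately before it.
static void ensureOrderedBefore(CoarseSchedule &schedule, scf::ForOp forOp,
                                Operation *a, Operation *b) {
  if (schedule.isOpBefore(a, b))
    return; // already emitted in the desired order
  // `tail` begins at `b` and contains every op after it in b's old cluster.
  CoarseSchedule::Cluster tail = schedule.splitClusterBefore(b, forOp);
  (void)tail; // a real caller would move or insert ops into `tail`
}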

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "STORE_TMEM_TO_GLOBAL_BYPASS_SMEM",
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
     "TRITON_F32_DEFAULT",
-    "ENABLE_MMA_V5_ATT_PIPELINE",
     "TRITON_INTEL_ADVANCED_PATH",
     "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
     "TRITON_INTEL_DO_NOT_SINK_INSTR_ACROSS_RGN",

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 3 additions & 2 deletions
@@ -217,8 +217,9 @@ getSharedMemoryScale(Value arg, mlir::PatternRewriter &rewriter, Location loc) {
       SharedMemorySpaceAttr::get(argType.getContext());
   auto CTALayout = getCTALayout(argType.getEncoding());
   // No swizzling for scale for now
-  auto newLayout = SwizzledSharedEncodingAttr::get(argType.getContext(), 1, 1,
-                                                   1, newOrder, CTALayout);
+  auto newLayout = NVMMASharedEncodingAttr::get(
+      argType.getContext(), argType.getShape(), newOrder, CTALayout,
+      argType.getElementType(), false);
   auto newType = MemDescType::get(argType.getShape(), argType.getElementType(),
                                   newLayout, SharedMemorySpace);
   rewriter.setInsertionPointAfterValue(arg);

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 3 additions & 7 deletions
@@ -316,14 +316,10 @@ class UseShmemForScales
       return false;

     auto sharedEnc =
-        cast<triton::gpu::SwizzledSharedEncodingAttr>(scaleType.getEncoding());
-    if (sharedEnc.getMaxPhase() != 1 || sharedEnc.getPerPhase() != 1 ||
-        sharedEnc.getVec() != 1) {
-      // For now, we do not expect swizzling to be applied to the scale SMEM.
-      // This is currently true for non-matmul operand SMEM allocated during
-      // pipelining.
+        dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(scaleType.getEncoding());
+    if (!sharedEnc || sharedEnc.getTransposed() || sharedEnc.getFp4Padded() ||
+        sharedEnc.getSwizzlingByteWidth() != 0)
       return false;
-    }

     if (usesTMAload) {
       return true;
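With the scale SMEM now carrying an NVMMASharedEncodingAttr, the requirement becomes a plain NVMMA layout: not transposed, not fp4-padded, and swizzling byte width zero. The same predicate, pulled out as a standalone helper for illustration (a sketch, not code from this commit):

// True if the scale's shared-memory encoding is an NVMMA layout with no
// transpose, no fp4 padding, and no swizzling, i.e. the check above.
static bool isPlainNVMMASharedEncoding(triton::gpu::MemDescType scaleType) {
  auto enc = dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(
      scaleType.getEncoding());
  return enc && !enc.getTransposed() && !enc.getFp4Padded() &&
         enc.getSwizzlingByteWidth() == 0;
}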

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 35 additions & 16 deletions
@@ -260,37 +260,56 @@ class AssignMMALatencies {
       : forOp(forOp), opLatency(opLatency) {};

   void run() {
-    if (!triton::tools::getBoolEnv("ENABLE_MMA_V5_ATT_PIPELINE")) {
-      int mmav5Count = 0;
-      for (auto &op : forOp.getBody()->without_terminator()) {
-        if (isa<ttng::MMAv5OpInterface>(&op)) {
-          mmav5Count++;
-        }
-      }
-      if (mmav5Count > 1)
-        return;
-    }
+    DenseMap<Operation *, int> mmaSelfLatency;
     // Check if the load op (mma operand) is pipelineable.
-    auto isLoadPipelineable = [&](Operation *op) {
+    auto isLoadToBePipelined = [&](Operation *op) {
      return opLatency.count(op) && opLatency[op] > 0;
     };
     for (auto &op : forOp.getBody()->without_terminator()) {
       // If the acc can not be multibuffered, do not pipeline the uses of
       // the MMA to later stages.
       if (auto mma = dyn_cast<ttng::MMAv5OpInterface>(&op)) {
-        if (ttng::mmaHasPipelineableOperands(mma, forOp, isLoadPipelineable) &&
-            !ttng::hasAccReadModifyWrite(mma, forOp) &&
-            ttng::isAccMultibufferingPossible(mma, forOp) &&
-            !getDisallowAccMultiBuffer(forOp)) {
-          opLatency[&op] = 1;
+        // Try to push out the wait by one stage even if the operands are not
+        // pipelineable, but we know where the loads are scheduled, so we can
+        // place the wait right before the loads.
+
+        if (hasSyncDots(forOp)) {
+          // Skip pipelining MMAs in loops that also contain sync dots. This is
+          // a dirty heuristic for performance drops in kernels where we would
+          // rather have the last iteration peeled than run a full iteration of
+          // masked operations only to execute a single wait.
+          continue;
+        }
+        auto pipeHelper = ttng::MMAv5PipelineableOperandsHelper(
+            mma, forOp, isLoadToBePipelined);
+        if (pipeHelper.isPipelineable ||
+            (pipeHelper.isOperandsStateDetermined &&
+             !ttng::hasLoadsAfterMMA(mma, forOp))) {
+          // The MMA can be overlapped with itself.
+          mmaSelfLatency[mma] = 1;
+          if (!ttng::requiresAccMultiBuffering(mma, forOp) ||
+              (ttng::isAccMultibufferingPossible(mma, forOp) &&
+               !getDisallowAccMultiBuffer(forOp))) {
+            // The MMA's users can be pushed to the next stage.
+            opLatency[&op] = 1;
+          }
         }
       }
     }
+    serializeSelfLatencies(forOp->getParentOfType<ModuleOp>(), mmaSelfLatency);
   }

 private:
   scf::ForOp forOp;
   DenseMap<Operation *, int> &opLatency;
+
+  bool hasSyncDots(scf::ForOp forOp) {
+    for (auto &op : forOp.getBody()->without_terminator()) {
+      if (isa<mlir::triton::DotOp>(op))
+        return true;
+    }
+    return false;
+  }
 };

 } // namespace
