Commit 2ec711b
[Blackwell] Subtile TMEM stores and improve TMEM interleaving (#6808)

This PR introduces a trick similar to `split(tmem_load)` for `tmem_store(join)`, propagating the store through the join. It also splits the TMEM load sinking out into a separate pass and makes it more powerful:

* It has more nuanced alias analysis, which allows interleaving RMW of values in TMEM.
* It will try to iteratively sink multiple pure user ops.

Having a separate pass also makes it compose better without remove-layout-conversions. This reduces register pressure a lot in certain cases, because ptxas is quite ineffective at sinking LDTM ops.
1 parent 99b5e29 commit 2ec711b

12 files changed (+553, −151 lines)


include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 7 additions & 0 deletions
@@ -29,6 +29,7 @@ include "triton/Dialect/Triton/IR/TritonTypes.td"
 include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
 include "triton/Dialect/Triton/IR/TritonInterfaces.td"
 include "triton/Dialect/Triton/IR/TritonOpInterfaces.td"
+include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td"
 include "mlir/IR/OpBase.td"
@@ -584,6 +585,12 @@ def TTNG_TMEMStoreOp : TTNG_Op<"tmem_store"> {
   );
   let results = (outs Optional<TTG_AsyncToken>:$token);

+  let builders = [
+    OpBuilder<(ins "Value":$dst, "Value":$src, "Value":$pred), [{
+      build($_builder, $_state, Type(), dst, Value(), src, pred);
+    }]>
+  ];
+
   let assemblyFormat = [{
     $src `,` $dst `` custom<Token>($dep, type($token)) `,` $pred
     attr-dict `:` type($src) `->` qualified(type($dst))
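
The new builder is a convenience overload for creating a store with no token result and no input dependency. A minimal usage sketch (not part of the diff; it assumes an OpBuilder `b`, a Location `loc`, and `dst`/`src`/`pred` Values in scope, plus the `ttng` namespace alias used elsewhere in this commit):

  // Creates a predicated ttng.tmem_store from src into dst, with the token
  // result and dep operand left empty, as in the builder body above.
  b.create<ttng::TMEMStoreOp>(loc, /*dst=*/dst, /*src=*/src, /*pred=*/pred);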

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeDescriptorEncodingPass();

 std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeTMemLayoutsPass();

+std::unique_ptr<Pass> createTritonNvidiaGPUInterleaveTMemPass();
+
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #define GEN_PASS_DECL_TRITONNVIDIAGPULEGALIZETMALAYOUTS

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td

Lines changed: 10 additions & 0 deletions
@@ -143,6 +143,16 @@ def TritonNvidiaGPUOptimizeTMemLayoutsPass : Pass<"triton-nvidia-optimize-tmem-l
                          "mlir::triton::TritonDialect"];
 }

+def TritonNvidiaGPUInterleaveTMemPass : Pass<"triton-nvidia-interleave-tmem", "mlir::ModuleOp"> {
+  let summary = "Interleave TMEM loads/stores.";
+
+  let description = [{
+    The `triton-nvidia-interleave-tmem` pass attempts to sink TMEM loads and
+    hoist TMEM stores, and potentially interleave them, to reduce register
+    pressure.
+  }];
+}
+
 def TritonNvidiaGPURemoveTMEMTokensPass : Pass<"triton-nvidia-gpu-remove-tmem-tokens", "mlir::ModuleOp"> {
   let summary = "remove TMEM tokens";

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 9 additions & 1 deletion
@@ -445,9 +445,17 @@ MemDescTransOp::inferReturnTypes(MLIRContext *context,
 // MemDescReshapeOp

 LogicalResult MemDescReshapeOp::verify() {
-  // Infer the dst layout from the source and verify that it is equivalent.
   MemDescType dstType = getResult().getType();
   MemDescType srcType = getSrc().getType();
+  if (product(dstType.getShape()) != product(srcType.getShape())) {
+    return emitError(
+        "number of src and dst elements of reshape must be the same");
+  }
+  if (dstType.getElementType() != srcType.getElementType()) {
+    return emitError("result element type must match src element type");
+  }
+
+  // Infer the dst layout from the source and verify that it is equivalent.
   auto srcEncoding = srcType.getEncoding();
   Attribute inferedDstEncoding;

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 19 additions & 3 deletions
@@ -244,8 +244,15 @@ static void printToken(OpAsmPrinter &p, Operation *op, Value dep, Type token) {
 void TCGen5MMAOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
+  // The op reads the accumulator if `useD` is not known to be false.
+  APInt useD;
+  if (!matchPattern(getUseD(), m_ConstantInt(&useD)) || !useD.isZero()) {
+    effects.emplace_back(MemoryEffects::Read::get(), &getDMutable(),
+                         TensorMemory::get());
+  }
   effects.emplace_back(MemoryEffects::Write::get(), &getDMutable(),
                        TensorMemory::get());
+
   if (isa<SharedMemorySpaceAttr>(getA().getType().getMemorySpace())) {
     effects.emplace_back(MemoryEffects::Read::get(), &getAMutable(),
                          SharedMemory::get());
@@ -296,8 +303,15 @@ void TCGen5MMAOp::build(OpBuilder &builder, OperationState &state, Type token,
 void TCGen5MMAScaledOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
+  // The op reads the accumulator if `useD` is not known to be false.
+  APInt useD;
+  if (!matchPattern(getUseD(), m_ConstantInt(&useD)) || !useD.isZero()) {
+    effects.emplace_back(MemoryEffects::Read::get(), &getDMutable(),
+                         TensorMemory::get());
+  }
   effects.emplace_back(MemoryEffects::Write::get(), &getDMutable(),
                        TensorMemory::get());
+
   if (isa<SharedMemorySpaceAttr>(getA().getType().getMemorySpace())) {
     effects.emplace_back(MemoryEffects::Read::get(), &getAMutable(),
                          SharedMemory::get());
@@ -488,10 +502,12 @@ void TMEMAllocOp::getEffects(
   // op.
   if (!getType().getMutableMemory() && !op->hasAttr("tensor_memory_col_offset"))
     return;
-  effects.emplace_back(MemoryEffects::Allocate::get(), TensorMemory::get());
+  OpResult alloc = getOperation()->getOpResult(0);
+  effects.emplace_back(MemoryEffects::Allocate::get(), alloc,
+                       TensorMemory::get());
   if (getSrc())
-    effects.emplace_back(MemoryEffects::Write::get(),
-                         getOperation()->getOpResult(0), TensorMemory::get());
+    effects.emplace_back(MemoryEffects::Write::get(), alloc,
+                         TensorMemory::get());
 }

 // -- TMEMCopyOp --
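
The conditional read effect above uses a standard MLIR matcher idiom: `matchPattern` with `m_ConstantInt` succeeds only when the predicate folds to a constant, so an unknown `useD` is conservatively treated as "may read". A standalone sketch of the same check (illustrative only; the helper name is hypothetical and not part of the diff):

  #include "mlir/IR/Matchers.h"

  // Returns false only when `useD` is statically known to be false; any other
  // value, including a non-constant, conservatively counts as a read of D.
  static bool mayReadAccumulator(mlir::Value useD) {
    llvm::APInt value;
    if (!mlir::matchPattern(useD, mlir::m_ConstantInt(&value)))
      return true;
    return !value.isZero();
  }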

lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 add_triton_library(TritonNvidiaGPUTransforms
   FenceInsertion.cpp
+  InterleaveTMem.cpp
   MMALowering.cpp
   OptimizeDescriptorEncoding.cpp
   OptimizeTMemLayouts.cpp

lib/Dialect/TritonNvidiaGPU/Transforms/InterleaveTMem.cpp

Lines changed: 260 additions & 0 deletions

@@ -0,0 +1,260 @@
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
#include "llvm/ADT/AddressRanges.h"

namespace {

using namespace mlir;

namespace ttng = triton::nvidia_gpu;
namespace ttg = triton::gpu;
namespace tt = triton;

#define GEN_PASS_CLASSES
#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h.inc"

// If we don't know the effects of the op, we add all possible effects.
void addAllValuelessEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Read>());
  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Write>());
  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Allocate>());
  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Free>());
}

bool collectEffects(Operation *op,
                    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  // Collect effect instances of the operation. Note that the implementation of
  // getEffects erases all effect instances that have a type other than the
  // template parameter, so we collect them first in a local buffer and then
  // copy.
  if (auto iface = dyn_cast<MemoryEffectOpInterface>(op)) {
    SmallVector<MemoryEffects::EffectInstance> localEffects;
    iface.getEffects(localEffects);
    llvm::append_range(effects, localEffects);
    return true;
  }
  if (op->hasTrait<OpTrait::HasRecursiveMemoryEffects>()) {
    for (auto &region : op->getRegions()) {
      for (auto &block : region) {
        for (auto &innerOp : block)
          if (!collectEffects(&innerOp, effects))
            return false;
      }
    }
    return true;
  }

  // We need to be conservative here in case the op doesn't have the interface
  // and assume it can have any possible effect.
  addAllValuelessEffects(effects);
  return false;
}

struct AccessRange {
  SmallVector<std::optional<llvm::AddressRange>> ranges;
  unsigned rankOffset = 0;
};

// Simple local alias analysis that looks for a single underlying allocation
// and an access subrange.
std::pair<Value, AccessRange> findBufferAccess(Value a) {
  // Handle block arguments.
  if (auto arg = dyn_cast<BlockArgument>(a)) {
    Operation *parentOp = arg.getOwner()->getParentOp();

    // Look through `ttg.warp_specialize` explicit captures.
    if (auto wsOp = dyn_cast<ttg::WarpSpecializePartitionsOp>(parentOp)) {
      return findBufferAccess(
          wsOp.getParentOp().getExplicitCaptures()[arg.getArgNumber()]);
    }

    // Unknown block argument.
    return {};
  }

  Operation *defOp = a.getDefiningOp();
  // Accessing the alloc accesses the whole buffer.
  if (auto alloc = dyn_cast<ttng::TMEMAllocOp>(defOp)) {
    AccessRange access;
    for (uint64_t dim : alloc.getType().getShape())
      access.ranges.push_back({{0, dim}});
    return {a, std::move(access)};
  }

  // Trans and Reshape views don't change the access size.
  if (isa<ttg::MemDescTransOp, ttg::MemDescReshapeOp>(defOp)) {
    return findBufferAccess(defOp->getOperand(0));
  }

  // Subviews can reduce the access sizes.
  if (auto subview = dyn_cast<ttg::MemDescSubviewOp>(defOp)) {
    auto [alloc, parentAccess] = findBufferAccess(subview.getSrc());
    if (!alloc)
      return {};
    // Handle subview of a subview. The first `rankOffset` access sizes are
    // the same as in the parent access.
    AccessRange childAccess;
    for (auto i : llvm::seq(parentAccess.rankOffset))
      childAccess.ranges.push_back(parentAccess.ranges[i]);

    // The subview may have a smaller rank, in which case its access size is
    // just 1 for the higher dims.
    childAccess.rankOffset =
        subview.getSrc().getType().getRank() - subview.getType().getRank();
    for (auto [i, offset] : llvm::enumerate(subview.getOffsets())) {
      auto parentRange = parentAccess.ranges[i + parentAccess.rankOffset];
      if (!parentRange) {
        childAccess.ranges.push_back({});
        continue;
      }

      // If the offset is not known, then the entire dim may be accessed.
      APInt value;
      if (!matchPattern(offset, m_ConstantInt(&value))) {
        childAccess.ranges.push_back({});
        continue;
      }

      uint64_t accessStart = parentRange->start() + value.getSExtValue();
      uint64_t accessSize = 1;
      if (i >= childAccess.rankOffset)
        accessSize = subview.getType().getShape()[i - childAccess.rankOffset];
      childAccess.ranges.push_back({{accessStart, accessStart + accessSize}});
    }
    return {alloc, std::move(childAccess)};
  }

  // Subslice is a subview only on the N dimension.
  if (auto subslice = dyn_cast<ttng::TMEMSubSliceOp>(defOp)) {
    auto [alloc, parentAccess] = findBufferAccess(subslice.getSrc());
    if (!alloc)
      return {};
    if (!parentAccess.ranges[1])
      return {alloc, parentAccess};
    uint64_t mStart = parentAccess.ranges[1]->start() + subslice.getN();
    uint64_t mSize = subslice.getType().getShape()[1];
    AccessRange childAccess = parentAccess;
    childAccess.ranges[1] = {{mStart, mStart + mSize}};
    return {alloc, std::move(childAccess)};
  }

  // Unknown defining op.
  return {};
}

bool tmemMayAlias(Value a, Value b) {
  auto [aAlloc, aRanges] = findBufferAccess(a);
  auto [bAlloc, bRanges] = findBufferAccess(b);
  // If the underlying buffer was not identified, assume they may alias.
  if (!aAlloc || !bAlloc)
    return true;
  // If the buffers are different, they don't alias.
  if (aAlloc != bAlloc)
    return false;
  // If the access ranges along any dimension are known to not overlap, then
  // the accesses don't alias.
  for (auto [aRange, bRange] : llvm::zip(aRanges.ranges, bRanges.ranges)) {
    // If either access range at this dim is unknown, we can't determine if
    // they don't overlap.
    if (!aRange || !bRange)
      continue;
    // The access ranges are known and don't overlap.
    if (!aRange->intersects(*bRange))
      return false;
  }
  return true;
}

// Sink tmem_loads as close to their use as possible to reduce register
// pressure.
bool sinkOps(Value buffer, ArrayRef<Operation *> useChain) {
  Operation *insertBefore = nullptr;
  Operation *next = useChain.back()->getNextNode();
  while (next && !next->hasTrait<OpTrait::IsTerminator>()) {
    insertBefore = next;
    bool dep = false;
    for (auto operand : getNestedOperands(next)) {
      if (llvm::any_of(useChain, [&](Operation *op) {
            return llvm::is_contained(op->getResults(), operand);
          })) {
        dep = true;
        break;
      }
    }
    // Don't sink past barrier signals, since they may guard the liverange
    // of the buffer.
    if (isa<ttng::ArriveBarrierOp>(next))
      break;
    if (!isMemoryEffectFree(next)) {
      SmallVector<MemoryEffects::EffectInstance> effects;
      collectEffects(next, effects);
      for (auto effect : effects) {
        // Look for potentially aliasing write or free effects.
        if (!isa<MemoryEffects::Write, MemoryEffects::Free>(effect.getEffect()))
          continue;
        if (isa<SideEffects::DefaultResource>(effect.getResource())) {
          dep = true;
          break;
        }
        if (isa<ttng::TensorMemory>(effect.getResource()) &&
            (!effect.getValue() || tmemMayAlias(effect.getValue(), buffer))) {
          dep = true;
          break;
        }
      }
    }
    if (dep)
      break;
    next = next->getNextNode();
  }
  if (insertBefore && insertBefore != useChain.back()->getNextNode()) {
    for (Operation *op : useChain)
      op->moveBefore(insertBefore);
    return true;
  }
  return false;
}

// Try to sink a load and a collection of its users.
bool trySinkOp(Operation *op, Value buffer) {
  SmallVector<Operation *> useChain{op};
  while (useChain.back()->hasOneUse() &&
         isPure(*useChain.back()->user_begin()) &&
         useChain.back()->getNextNode() == *useChain.back()->user_begin()) {
    useChain.push_back(*useChain.back()->user_begin());
  }
  return sinkOps(buffer, useChain);
}

struct TritonNvidiaGPUInterleaveTMemPass
    : public TritonNvidiaGPUInterleaveTMemPassBase<
          TritonNvidiaGPUInterleaveTMemPass> {
  using TritonNvidiaGPUInterleaveTMemPassBase::
      TritonNvidiaGPUInterleaveTMemPassBase;

  void runOnOperation() override {
    MLIRContext *context = &getContext();
    ModuleOp m = getOperation();
    SmallVector<std::pair<Operation *, Value>> opsToSink;
    m.walk([&](Operation *op) {
      if (auto load = dyn_cast<ttng::TMEMLoadOp>(op))
        opsToSink.emplace_back(load, load.getSrc());
      else if (auto alloc = dyn_cast<ttng::TMEMAllocOp>(op))
        opsToSink.emplace_back(alloc, alloc.getResult());
    });
    for (auto [op, buffer] : opsToSink) {
      while (trySinkOp(op, buffer)) {
        // Keep trying to sink loads and their users.
      }
    }
  }
};

} // namespace

std::unique_ptr<Pass> mlir::createTritonNvidiaGPUInterleaveTMemPass() {
  return std::make_unique<TritonNvidiaGPUInterleaveTMemPass>();
}
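
As a rough usage sketch (not part of the diff), the pass can be added to a pipeline through the factory declared in Passes.h above, or run standalone via the registered `triton-nvidia-interleave-tmem` flag; where it is scheduled in Triton's default compilation pipeline is not shown in this commit, and the wrapper function name below is hypothetical:

  #include "mlir/Pass/PassManager.h"
  #include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"

  // Schedules the interleaving pass, which sinks tmem_load ops (and chains of
  // pure users) toward their uses to reduce register pressure.
  void addInterleaveTMem(mlir::PassManager &pm) {
    pm.addPass(mlir::createTritonNvidiaGPUInterleaveTMemPass());
  }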
