Commit 1e0e51c

[LAYOUTS] Remove HoistLayoutConversion in favour of backwardsRemat (#5788)
Reland of triton-lang/triton#5673
1 parent ca582a2 commit 1e0e51c

9 files changed: +468 −422 lines

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 3 additions & 1 deletion
@@ -581,6 +581,7 @@ def TT_TransOp : TT_Op<"trans", [Pure,
   let assemblyFormat = "$src attr-dict `:` type($src) `->` type($result)";

   let hasFolder = 1;
+  let hasVerifier = 1;
 }

 //
@@ -830,7 +831,8 @@ def TT_MakeRangeOp : TT_Op<"make_range", [Pure]> {
 def TT_ElementwiseInlineAsmOp : TT_Op<"elementwise_inline_asm", [
     Elementwise,
     SameOperandsAndResultEncoding,
-    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+    DeclareOpInterfaceMethods<ConditionallySpeculatable>
 ]> {
   let summary = "inline assembly applying an elementwise operation to a group of packed elements.";
   let description = [{

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure]> {

 def TTG_MemDescTransOp : TTG_Op<"memdesc_trans", [Pure,
                                                   TransposeOpInterface,
-                                                  DeclareOpInterfaceMethods<InferTypeOpInterface>,
+                                                  InferTypeOpWithLayoutEquivalence,
                                                   SameOperandsAndResultElementType]> {
   let summary = "transpose the descriptor";

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 23 additions & 0 deletions
@@ -209,6 +209,23 @@ OpFoldResult TransOp::fold(FoldAdaptor adaptor) {
   return {};
 }

+LogicalResult TransOp::verify() {
+  auto order = getOrder();
+  auto srcTy = cast<RankedTensorType>(getSrc().getType());
+  if (order.size() != srcTy.getShape().size()) {
+    return emitError("order must have the same size as the source tensor");
+  }
+  if (!isPermutationOfIota(order)) {
+    return emitError("order must be a permutation of 0..n-1");
+  }
+  SmallVector<int64_t> retShape = applyPermutation(srcTy.getShape(), order);
+  if (retShape != getType().getShape()) {
+    return emitError(
+        "result shape must match the permutation of the source shape");
+  }
+  return success();
+}
+
 LogicalResult TransOp::inferReturnTypes(
     MLIRContext *context, std::optional<Location> location,
     TransOp::Adaptor adaptor, SmallVectorImpl<Type> &inferredReturnTypes) {
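For illustration only (the tensors, shapes, and order attribute below are invented for this note, not taken from the commit), the new verifier accepts a transpose whose result shape is the permuted source shape and rejects one that is not; tt.trans prints with the assembly format declared in TritonOps.td above:

  // Accepted: order is a permutation of 0..1 and 64x32 is 32x64 permuted.
  %t = tt.trans %x {order = array<i32: 1, 0>} : tensor<32x64xf32> -> tensor<64x32xf32>

  // Rejected: "result shape must match the permutation of the source shape".
  %u = tt.trans %x {order = array<i32: 1, 0>} : tensor<32x64xf32> -> tensor<32x64xf32>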
@@ -1037,6 +1054,12 @@ void ElementwiseInlineAsmOp::getEffects(
                        SideEffects::DefaultResource::get());
 }

+Speculation::Speculatability ElementwiseInlineAsmOp::getSpeculatability() {
+  if (getPure())
+    return Speculation::Speculatable;
+  return Speculation::NotSpeculatable;
+}
+
 LogicalResult ElementwiseInlineAsmOp::verify() {
   if (getNumOperands() >= 1) {
     auto tensorType = dyn_cast<RankedTensorType>(getOperand(0).getType());
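A rough sketch of what the new interface buys (the asm string, constraints, and tensor types below are illustrative assumptions, not from this commit): an op with pure = true now reports Speculation::Speculatable, so transforms that move code across control flow, such as LICM or layout rematerialization, may hoist or duplicate it; with pure = false it stays NotSpeculatable and is left where the program put it.

  // Speculatable: pure = true, so the op may be moved or rematerialized.
  %y = tt.elementwise_inline_asm "shl.b32 $0, $1, 3;"
         {constraints = "=r,r", packed_element = 1 : i32, pure = true}
         %x : tensor<128xi32> -> tensor<128xi32>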

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 15 additions & 13 deletions
@@ -454,15 +454,17 @@ OpFoldResult MemDescTransOp::fold(FoldAdaptor adaptor) {
   return {};
 }

-LogicalResult MemDescTransOp::inferReturnTypes(
-    MLIRContext *context, std::optional<Location> location, ValueRange operands,
-    DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
+LogicalResult
+MemDescTransOp::inferReturnTypes(MLIRContext *context,
+                                 std::optional<Location> location,
+                                 MemDescTransOp::Adaptor adaptor,
+                                 SmallVectorImpl<Type> &inferredReturnTypes) {
+
   // type is the same as the input
-  auto argTy = cast<MemDescType>(operands[0].getType());
-  auto argShape = argTy.getShape();
-  auto order = properties.as<Properties *>()->order.asArrayRef();
-  SmallVector<int64_t> retShape = applyPermutation(argTy.getShape(), order);
+  auto argTy = cast<MemDescType>(adaptor.getSrc().getType());
+  auto shape = argTy.getShape();
+  auto order = adaptor.getOrder();
+  SmallVector<int64_t> retShape = applyPermutation(shape, order);

   auto retEltTy = argTy.getElementType();
   Attribute argEncoding = argTy.getEncoding();
@@ -471,17 +473,17 @@ LogicalResult MemDescTransOp::inferReturnTypes(
     Dialect &dialect = argEncoding.getDialect();
     auto inferLayoutInterface = cast<DialectInferLayoutInterface>(&dialect);
     if (inferLayoutInterface
-            ->inferTransOpEncoding(argEncoding, argShape, order, retEncoding)
+            ->inferTransOpEncoding(argEncoding, shape, order, retEncoding)
             .failed()) {
       return failure();
     }
   }
-  auto memDescTy = cast<MemDescType>(argTy);
-  inferredReturnTypes.push_back(MemDescType::get(
-      retShape, retEltTy, retEncoding, memDescTy.getMemorySpace(),
-      memDescTy.getMutableMemory()));
+  inferredReturnTypes.push_back(
+      MemDescType::get(retShape, retEltTy, retEncoding, argTy.getMemorySpace(),
+                       argTy.getMutableMemory()));
   return success();
 }
+
 // LocalAllocOp
 void LocalAllocOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
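As a purely illustrative sketch (the shapes, the #shared and #shared_T layout names, and the exact !ttg.memdesc spelling are assumptions), the inferred result type is the source descriptor with its shape permuted by order and its encoding produced by inferTransOpEncoding, keeping the memory space and mutability of the source:

  %t = ttg.memdesc_trans %a {order = array<i32: 1, 0>}
       : !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory>
       -> !ttg.memdesc<64x32xf16, #shared_T, #ttg.shared_memory>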

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 0 additions & 142 deletions
@@ -24,36 +24,6 @@ namespace {
 // Roughly, whether op is elementwise and thus threads don't need
 // to exchange elements. But some ops are not currently supported even though
 // they meet that criterion.
-bool canHoistDotOpEncV2(Operation *op, DotOperandEncodingAttr &dotOpEnc) {
-  // Only consider custom conversions or arith ops.
-  // TODO(jlebar): Is this too restrictive?
-  if (!isa<FpToFpOp, BitcastOp>(op) && !isPureUnaryInlineAsm(op) &&
-      !isa<arith::ArithDialect>(op->getDialect()))
-    return false;
-
-  // Quick handling to fix loading issues when computing the original
-  // bitwidth is unable to realize that there is a mixed-precision dot
-  // (hence kWidth = 1) but wants to hoist through the type conversion.
-  if (isa<arith::ExtFOp>(op) && dotOpEnc.getKWidth() == 1)
-    return false;
-
-  // Currently, these instructions are not supported during lowering of
-  // shared -> dot_operand layout. Not all types and type conversions are
-  // supported.
-  if (isa<arith::TruncIOp, arith::TruncFOp, arith::SelectOp>(op))
-    return false;
-
-  // Don't hoist through u1 -> fp casts as they aren't supported in
-  // ElementwiseOpToLLVM::reorderValues().
-  if (isa<arith::UIToFPOp>(op)) {
-    Type opType = getElementTypeOrSelf(op->getOperand(0));
-    if (opType.isInteger(1))
-      return false;
-  }
-
-  return true;
-}
-
 // Analog of canHoistDotOpEncV2, but for MMAv3 (WGMMA where operand A
 // is in registers).
 bool canHoistDotOpEncV3(Operation *op) {
@@ -198,116 +168,6 @@ class SwizzleShmemConvert : public OpRewritePattern<ConvertLayoutOp> {
   }
 };

-// Move convert-to-dot-operand "up" past elementwise ops:
-//
-// convert(elementwise(x)) #dot_operand ->
-// elementwise(convert(x, #dot_operand)).
-//
-// The goal is to put the convert right next to the originating load. If we can
-// accomplish this, then we can save a shmem round-trip:
-//
-// Before:
-//
-//  - Load from global into shmem using an async copy.
-//  - Load from shmem into a #blocked layout.
-//  - Do elementwise ops over #blocked layout.
-//  - Convert to #dot_operand (round-trip through shmem).
-//  - Do dot.
-//
-// After:
-//
-//  - Load from global into shmem using an async copy (same as before).
-//  - Load from shmem into a #dot_operand layout.
-//  - Do elementwise ops over #dot_operand layout.
-//  - Do dot.
-//
-// This can also be propagated when we have a constant, instead of a load.
-//
-// Eliminating the shmem round-trip is such a big win, we're willing to do it
-// even if this duplicates work because some of the elementwise ops have uses
-// that don't flow into the dot. On the other hand, we only want to do this if
-// we can in fact reduce shmem round-trips: For example, simply moving a convert
-// up above e.g. an `add` now means we have *two* converts. That's worse,
-// unless we can continue moving the converts upwards and eventually merge them.
-// So we try to check that this will be beneficial before making any changes.
-class HoistLayoutConversion : public OpRewritePattern<ConvertLayoutOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(ConvertLayoutOp cvt,
-                                PatternRewriter &rewriter) const override {
-    // Only consider conversions to dot operand.
-    auto cvtTy = cast<RankedTensorType>(cvt.getType());
-    auto dotOpEnc = dyn_cast<DotOperandEncodingAttr>(cvtTy.getEncoding());
-    if (!dotOpEnc)
-      return failure();
-
-    auto src = cvt.getSrc().getDefiningOp();
-    if (!src || src->getNumOperands() == 0 || src->getNumResults() != 1)
-      return failure();
-
-    auto srcTy = dyn_cast<RankedTensorType>(src->getResult(0).getType());
-    if (!srcTy)
-      return failure();
-
-    if (!all_of(src->getOperandTypes(),
-                [](Type ty) { return isa<RankedTensorType>(ty); }))
-      return failure();
-
-    if (!canHoistDotOpEncV2(src, dotOpEnc))
-      return failure();
-
-    // Check that the conversion is transitively dependent on a load or a
-    // constant, and all operations between it and the convert are layout
-    // preserving.
-    //
-    // TODO(jlebar): This is accidentally quadratic; we iterate over the whole
-    // slice but then at the end we only modify one op!
-    SetVector<Operation *> slice;
-    BackwardSliceOptions opt;
-    opt.omitBlockArguments = true;
-    getBackwardSlice(cvt.getOperation(), &slice, opt);
-
-    // TODO(jlebar): This is too conservative when there are multiple loads in
-    // the chain. If one of the loads has a non-layout-preserving op and the
-    // other does not, then we may or may not accept the chain, depending on
-    // which load gets hit first by getBackwardSlice. For example:
-    //   cvt(broadcast(load(x)) + load(y)) // accepted & load(y) will benefit.
-    //   cvt(load(y) + broadcast(load(x))) // rejected & load(y) will not benefit.
-    bool foundInitializer = false;
-    // Reverse the slice so that we start directly above the convert and check
-    // that every op allows hoisting until we find a load or a constant.
-    for (Operation *currOp : llvm::reverse(slice)) {
-      if (isa<LoadOp>(currOp) || isa<arith::ConstantOp>(currOp)) {
-        foundInitializer = true;
-        break;
-      }
-      if (!canHoistDotOpEncV2(currOp, dotOpEnc))
-        return failure();
-    }
-    if (!foundInitializer)
-      return failure();
-
-    SmallVector<ConvertLayoutOp> newOperands;
-    for (auto operand : src->getOperands()) {
-      // We checked earlier that all operands are ranked tensors.
-      auto operandTy = cast<RankedTensorType>(operand.getType());
-      Type newCvtTy = RankedTensorType::get(
-          srcTy.getShape(), operandTy.getElementType(), cvtTy.getEncoding());
-      newOperands.push_back(
-          rewriter.create<ConvertLayoutOp>(cvt.getLoc(), newCvtTy, operand));
-    }
-    auto newRet = rewriter.clone(*src);
-    for (int i = 0; i < newOperands.size(); i++)
-      newRet->setOperand(i, newOperands[i]);
-    newRet->getResult(0).setType(RankedTensorType::get(
-        srcTy.getShape(), srcTy.getElementType(), cvtTy.getEncoding()));
-
-    rewriter.replaceOp(cvt, newRet->getResults());
-    return success();
-  }
-};
-
 // Rewrite
 //
 //   dot(alloc(trans() #shared1) ->
@@ -702,8 +562,6 @@ class TritonGPUOptimizeDotOperandsPass
     mlir::RewritePatternSet patterns(context);
     patterns.add<MMAV3HoistLayoutConversion>(context);
     patterns.add<SwizzleShmemConvert>(context);
-    if (this->hoistLayoutConversion.getValue())
-      patterns.add<HoistLayoutConversion>(context);
    patterns.add<FuseTransMMAV3Plus>(context);
    patterns.add<MMAV3UseRegOperand>(context);
    patterns.add<InjectTMemCopy>(context);
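Note: with HoistLayoutConversion removed, hoisting converts toward their defining loads is, per the commit title, left to the backward rematerialization logic (backwardsRemat), presumably the backwardRematerialization step of the remove-layout-conversions pass; the hoistLayoutConversion option is no longer consulted in this pass body.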

lib/Dialect/TritonGPU/Transforms/Prefetch.cpp

Lines changed: 12 additions & 1 deletion
@@ -31,6 +31,11 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "tritongpu-prefetch"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

 namespace mlir {
 namespace triton {
@@ -186,17 +191,23 @@ LogicalResult Prefetcher::initialize() {
   bool foundConvertFromShared = false;
   SmallVector<Value> rets;
   rets.push_back(op->getResult(0));
+  LDBG("Prefetch src: " << *op);
   while (op) {
     if (op->getNumOperands() != 1)
       break;
     if (!op->getResult(0).hasOneUse())
       break;
     rets.push_back(op->getOperand(0));
     if (auto cvt = dyn_cast<triton::gpu::LocalLoadOp>(op)) {
-      foundConvertFromShared = true;
+      // NYI for other encodings, for example if we have transpose
+      // in the chain
+      if (isa<DotOperandEncodingAttr>(cvt.getType().getEncoding()))
+        foundConvertFromShared = true;
       break;
     }
     op = op->getOperand(0).getDefiningOp();
+    if (op)
+      LDBG("op: " << *op);
   }
   std::reverse(rets.begin(), rets.end());
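With the new LDBG tracing, an assertions-enabled build can print the prefetch chain walk through LLVM's -debug-only=tritongpu-prefetch flag (for example when driving the pass via triton-opt); the exact invocation depends on the local setup.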
