Commit 976c4e4

Revert "[Layouts] Reuse existing rematerializations in backwards pass" (#5418)
Reverts triton-lang/triton#5410, which causes hangs/crashes in more complex examples. Reverting until a better solution is found, since the follow-up doesn't fix the issue either.
1 parent a6c83ee commit 976c4e4
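
For context, the reverted change (visible in the removed hunks below) taught the backward slice walk to reuse conversions that already exist: getConvertBackwardSlice accepted an optional getExistingConversion callback, and when a (value, encoding) pair already had a rematerialized equivalent, the walk recorded that existing value instead of continuing through the defining op. The following is a minimal standalone sketch of that short-circuit, not the actual Triton code: collectBackwardSlice and operandsOf are hypothetical names, and plain ints stand in for mlir::Value and mlir::Attribute.

#include <functional>
#include <optional>
#include <utility>
#include <vector>

using Value = int;    // stand-in for mlir::Value
using Encoding = int; // stand-in for mlir::Attribute

// Sketch of the reverted short-circuit: before walking through a value's
// defining op, ask whether an equivalent conversion already exists and, if
// so, record it and stop propagating along that path. (The real pass also
// deduplicates (value, encoding) pairs with a `seen` set, omitted here.)
void collectBackwardSlice(
    Value root, Encoding rootEncoding, std::vector<Value> &slice,
    const std::function<std::vector<Value>(Value)> &operandsOf,
    const std::function<std::optional<Value>(Value, Encoding)>
        &getExistingConversion) {
  std::vector<std::pair<Value, Encoding>> queue{{root, rootEncoding}};
  while (!queue.empty()) {
    auto [current, encoding] = queue.back();
    queue.pop_back();
    if (getExistingConversion) {
      if (std::optional<Value> existing =
              getExistingConversion(current, encoding)) {
        slice.push_back(*existing); // Reuse the existing conversion ...
        continue;                   // ... and don't walk further back.
      }
    }
    slice.push_back(current);
    for (Value operand : operandsOf(current)) // Keep walking backwards.
      queue.emplace_back(operand, encoding);
  }
}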

File tree: 5 files changed, +28 −87 lines changed


include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 1 addition & 2 deletions
@@ -163,8 +163,7 @@ Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
 LogicalResult getConvertBackwardSlice(
     Value root, SetVector<Value> &slice, Attribute rootEncoding,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr,
-    std::function<Value(Value, Attribute)> getExistingConversion = nullptr);
+    std::function<bool(Operation *)> stopPropagation = nullptr);

 // Populate pattern to remove dead cycles in ForOp.
 void populateForOpDeadArgumentElimination(RewritePatternSet &patterns);

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 20 additions & 46 deletions
@@ -116,13 +116,17 @@ class LayoutPropagation {
 class LayoutRematerialization {
 public:
   LayoutRematerialization(FuncOp F) : funcOp(F) {}
-
   // Map the original value to the remat'ed one.
   void addRematValue(Value old, Attribute encoding, Value newV);
-  // Get the remat'ed value in the given encoding, if one already exists and
-  // is different then the layout conversion root.
-  Value getRematValue(Value value, Attribute encoding, Value root) const;
-
+  bool hasRematValue(Value value, Attribute encoding) {
+    return rematMapping.contains({value, encoding});
+  }
+  // Return the remat'ed value in the given encoding.
+  Value getRematValue(Value value, Attribute encoding) {
+    auto it = rematMapping.find({value, encoding});
+    assert(it != rematMapping.end());
+    return it->second;
+  }
   void cleanup();
   void backwardRematerialization();
   void backwardRematerialization(ConvertLayoutOp convertOp);
@@ -133,11 +137,6 @@ class LayoutRematerialization {
   void rewriteSlice(SetVector<Value> &slice, DenseMap<Value, Attribute> &layout,
                     ConvertLayoutOp convertOp);

-  LogicalResult getRematerializableSlice(
-      Value root, Attribute rootEncoding, SetVector<Value> &slice,
-      DenseMap<Value, Attribute> &layout,
-      std::function<bool(Operation *)> stopPropagation = nullptr);
-
 private:
   void updateRematMapping(SmallVector<std::tuple<Value, Value>> &values);
   // Existing tuples of (value, layout) that needs to be updated when recreating
@@ -158,21 +157,6 @@ void LayoutRematerialization::addRematValue(Value old, Attribute encoding,
   mappedValues[old] = encoding;
 }

-Value LayoutRematerialization::getRematValue(Value value, Attribute encoding,
-                                             Value root) const {
-  Value remat = rematMapping.lookup({value, encoding});
-  if (!remat)
-    return {};
-  // If the remat'ed value is a conversion result, make sure it is different
-  // than the root of the one we're looking at.
-  if (auto cvt = remat.getDefiningOp<ConvertLayoutOp>()) {
-    if (cvt.getSrc() == root)
-      return {};
-  }
-  // This remat'ed value can be reused.
-  return remat;
-}
-
 // Remove unneeded values now that we are done with the rematMapping.
 void LayoutRematerialization::cleanup() {
   for (Operation *op : llvm::reverse(opToDelete))
@@ -794,8 +778,8 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
     auto layoutIt = layout.find(v);
     assert(layoutIt != layout.end());
     // If we already have a remat value for this value, use it.
-    if (Value remat = getRematValue(v, layoutIt->second, convertOp.getSrc())) {
-      mapping.map(v, remat);
+    if (hasRematValue(v, layoutIt->second)) {
+      mapping.map(v, getRematValue(v, layoutIt->second));
       valuesWithExistingRemat.insert(v);
       continue;
     }
@@ -956,17 +940,12 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
   rewriteSlice(slice, layout, convertOp, mapping);
 }

-LogicalResult LayoutRematerialization::getRematerializableSlice(
+LogicalResult getRematerializableSlice(
     Value root, Attribute rootEncoding, SetVector<Value> &slice,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation) {
-  // Allow re-using existing conversions for a value.
-  auto getExistingConversion = [&](Value value, Attribute encoding) -> Value {
-    return getRematValue(value, encoding, root);
-  };
-  LogicalResult result =
-      getConvertBackwardSlice(root, slice, rootEncoding, layout,
-                              stopPropagation, getExistingConversion);
+    std::function<bool(Operation *)> stopPropagation = nullptr) {
+  LogicalResult result = getConvertBackwardSlice(root, slice, rootEncoding,
+                                                 layout, stopPropagation);
   if (result.failed() || slice.empty())
     return failure();
@@ -983,14 +962,8 @@ LogicalResult LayoutRematerialization::getRematerializableSlice(
 void LayoutRematerialization::backwardRematerialization() {
   // Go through each ConvertLayoutOp.
   SmallVector<ConvertLayoutOp> convertOps;
-  funcOp.walk([&](ConvertLayoutOp convertOp) {
-    convertOps.push_back(convertOp);
-    // Add existing layout conversions as rematerializations of themselves. This
-    // enables rematerialization of other conversions to re-use existing
-    // conversions. Importantly, don't add them to `mappedValues`.
-    rematMapping.insert(
-        {{convertOp.getSrc(), convertOp.getType().getEncoding()}, convertOp});
-  });
+  funcOp.walk(
+      [&](ConvertLayoutOp convertOp) { convertOps.push_back(convertOp); });
   for (ConvertLayoutOp convertOp : convertOps) {
     backwardRematerialization(convertOp);
   }
@@ -1015,13 +988,14 @@ void LayoutRematerialization::backwardRematerialization(
   // careful with the heuristics for both correctness and perf
   if (isa<DotOperandEncodingAttr, LinearEncodingAttr>(targetType.getEncoding()))
     return;
-  Value oldV = convertOp.getSrc();
+  Value oldV = convertOp->getOperand(0);
   LDBG("check backward remat with source " << oldV << " encoding "
                                            << targetType.getEncoding());
   // Check to see if there are existing remat'ed values for the pair of oldValue
   // and encoding.
-  if (Value newV = getRematValue(oldV, targetType.getEncoding(), oldV)) {
+  if (hasRematValue(oldV, targetType.getEncoding())) {
     // Replace it with the remat'ed value.
+    Value newV = getRematValue(oldV, targetType.getEncoding());
     convertOp.replaceAllUsesWith(newV);
     opToDelete.insert(convertOp);
     LDBG("found remat'ed value" << newV);

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 5 additions & 11 deletions
@@ -770,11 +770,11 @@ static bool isFreeConvert(Operation *op) {
                            convertOp.getType());
 }

-LogicalResult getConvertBackwardSlice(
-    Value root, SetVector<Value> &slice, Attribute rootEncoding,
-    DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation,
-    std::function<Value(Value, Attribute)> getExistingConversion) {
+LogicalResult
+getConvertBackwardSlice(Value root, SetVector<Value> &slice,
+                        Attribute rootEncoding,
+                        DenseMap<Value, Attribute> &layout,
+                        std::function<bool(Operation *)> stopPropagation) {
   DenseSet<std::pair<Value, Attribute>> seen;
   SmallVector<std::pair<Value, Attribute>> queue;
@@ -814,12 +814,6 @@ LogicalResult getConvertBackwardSlice(

       continue;
     }
-    Value existing;
-    if (getExistingConversion &&
-        (existing = getExistingConversion(currentValue, encoding))) {
-      enqueue(existing, encoding);
-      continue;
-    }
     if (auto *definingOp = currentValue.getDefiningOp()) {
       // If the op has multiple results we need to update all results layout.
       for (Value result : definingOp->getResults()) {

test/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -14,7 +14,6 @@ configure_lit_site_cfg(
 set(TRITON_TEST_DEPENDS
   triton-opt
   triton-tensor-layout
-  triton-llvm-opt
 )

 set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck")

test/TritonGPU/combine.mlir

Lines changed: 2 additions & 27 deletions
@@ -2427,9 +2427,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
   %2 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<64x64x!tt.ptr<f16>, #blocked2>
   %3 = tt.splat %arg1 : !tt.ptr<i8> -> tensor<128x64x!tt.ptr<i8>, #blocked>
   %4 = tt.splat %arg1 : !tt.ptr<i8> -> tensor<128x64x!tt.ptr<i8>, #blocked>
-  // CHECK: %[[F:.+]]:3 = scf.for {{.*}} -> (tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>)
-  // CHECK-COUNT-4: convert_layout
-  // CHECK: scf.yield {{.*}} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
+  // CHECK: %[[F:.+]]:3 = scf.for {{.*}} -> (tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>)
+  // CHECK: scf.yield {{.*}} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>
   // CHECK: }
   // CHECK: tt.return %[[F]]#0, %[[F]]#1 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
   %5:3 = scf.for %arg2 = %c0_i32 to %c2048_i32 step %c64_i32 iter_args(%arg3 = %cst_2, %arg4 = %cst, %arg5 = %cst_0) -> (tensor<128x64xf32, #mma>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) : i32 {
@@ -2773,27 +2772,3 @@ tt.func @do_not_remat(%arg0: tensor<64x64xf32, #blocked1>) -> tensor<1x64xf32, #
 }

 }
-
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
-#blocked1 = #ttg.blocked<{sizePerThread = [2, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
-
-// CHECK-LABEL: reuse_layout_conversion
-tt.func @reuse_layout_conversion(%arg0: tensor<64x64xf32, #blocked>) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) {
-  // CHECK-NEXT: %cst = arith.constant {{.*}} tensor<64x64xf32, #blocked>
-  %cst = arith.constant dense<2.000000e+00> : tensor<64x64xf32, #blocked1>
-  // CHECK-NEXT: [[TRANS:%.*]] = tt.trans %arg0 {{.*}} tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1>
-  %0 = tt.trans %arg0 {order = array<i32: 1, 0>} : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1>
-  // CHECK-NEXT: [[CVT:%.*]] = ttg.convert_layout [[TRANS]] : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked>
-  %1 = ttg.convert_layout %0 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked>
-  // CHECK-NEXT: [[RESULT:%.*]] = arith.mulf [[CVT]], %cst : tensor<64x64xf32, #blocked>
-  %2 = arith.mulf %0, %cst : tensor<64x64xf32, #blocked1>
-  %3 = ttg.convert_layout %2 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked>
-  // CHECK-NEXT: return [[CVT]], [[RESULT]]
-  tt.return %1, %3 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>
-}
-
-}
