Commit 82fec37
[Layouts] Reuse existing rematerializations in backwards pass (#5410)
This PR lets the backwards rematerialization pass reuse existing `convert_layout` ops that were not removed by other means. The compiler can then eliminate some tricky layout conversions by recognizing that the same computation can be reconstructed from another, already existing conversion.
1 parent 92b6e26 · commit 82fec37
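As intuition for the rule this commit adds, below is a minimal, self-contained C++ sketch. It is not the Triton implementation (the real logic is `LayoutRematerialization::getRematValue` and the `getExistingConversion` callback in the diffs below); plain strings stand in for MLIR values and encodings. The key point is the root check: an existing conversion may be reused only if it is not the very conversion being rewritten.

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <utility>

// Stand-ins for MLIR Value and layout Attribute.
using Value = std::string;
using Encoding = std::string;

struct Conversion {
  Value src;    // value being converted
  Value result; // value produced in the target encoding
};

// Look up a previously materialized conversion of `value` into `encoding`.
// Refuse it when its source is the root of the conversion currently being
// rewritten, since "reusing" that one would just recreate the conversion we
// are trying to eliminate.
std::optional<Value>
getReusableConversion(const std::map<std::pair<Value, Encoding>, Conversion> &known,
                      const Value &value, const Encoding &encoding,
                      const Value &root) {
  auto it = known.find({value, encoding});
  if (it == known.end() || it->second.src == root)
    return std::nullopt;
  return it->second.result;
}

int main() {
  // Seed the map the way the pass does: %1 = convert_layout %0 to #blocked.
  std::map<std::pair<Value, Encoding>, Conversion> known;
  known[{"%0", "#blocked"}] = Conversion{"%0", "%1"};

  // Rewriting a conversion rooted at %2 may reuse %1 for (%0, #blocked)...
  std::cout << getReusableConversion(known, "%0", "#blocked", "%2")
                   .value_or("<none>")
            << "\n"; // prints: %1
  // ...but a conversion rooted at %0 itself may not.
  std::cout << getReusableConversion(known, "%0", "#blocked", "%0")
                   .value_or("<none>")
            << "\n"; // prints: <none>
}
```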

5 files changed: +87 additions, -28 deletions

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 2 additions & 1 deletion

```diff
@@ -163,7 +163,8 @@ Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
 LogicalResult getConvertBackwardSlice(
     Value root, SetVector<Value> &slice, Attribute rootEncoding,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr);
+    std::function<bool(Operation *)> stopPropagation = nullptr,
+    std::function<Value(Value, Attribute)> getExistingConversion = nullptr);
 
 // Populate pattern to remove dead cycles in ForOp.
 void populateForOpDeadArgumentElimination(RewritePatternSet &patterns);
```
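The new `getExistingConversion` parameter lets a caller offer an already materialized equivalent of a `(value, encoding)` pair during the backward walk; as the `Utility.cpp` change below shows, when the callback returns a non-null `Value`, the walk continues from that existing value instead of descending through the defining op.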

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 46 additions & 20 deletions

```diff
@@ -116,17 +116,13 @@ class LayoutPropagation {
 class LayoutRematerialization {
 public:
   LayoutRematerialization(FuncOp F) : funcOp(F) {}
+
   // Map the original value to the remat'ed one.
   void addRematValue(Value old, Attribute encoding, Value newV);
-  bool hasRematValue(Value value, Attribute encoding) {
-    return rematMapping.contains({value, encoding});
-  }
-  // Return the remat'ed value in the given encoding.
-  Value getRematValue(Value value, Attribute encoding) {
-    auto it = rematMapping.find({value, encoding});
-    assert(it != rematMapping.end());
-    return it->second;
-  }
+  // Get the remat'ed value in the given encoding, if one already exists and
+  // is different than the layout conversion root.
+  Value getRematValue(Value value, Attribute encoding, Value root) const;
+
   void cleanup();
   void backwardRematerialization();
   void backwardRematerialization(ConvertLayoutOp convertOp);
@@ -137,6 +133,11 @@ class LayoutRematerialization {
   void rewriteSlice(SetVector<Value> &slice, DenseMap<Value, Attribute> &layout,
                     ConvertLayoutOp convertOp);
 
+  LogicalResult getRematerializableSlice(
+      Value root, Attribute rootEncoding, SetVector<Value> &slice,
+      DenseMap<Value, Attribute> &layout,
+      std::function<bool(Operation *)> stopPropagation = nullptr);
+
 private:
   void updateRematMapping(SmallVector<std::tuple<Value, Value>> &values);
   // Existing tuples of (value, layout) that needs to be updated when recreating
@@ -157,6 +158,21 @@ void LayoutRematerialization::addRematValue(Value old, Attribute encoding,
   mappedValues[old] = encoding;
 }
 
+Value LayoutRematerialization::getRematValue(Value value, Attribute encoding,
+                                             Value root) const {
+  Value remat = rematMapping.lookup({value, encoding});
+  if (!remat)
+    return {};
+  // If the remat'ed value is a conversion result, make sure it is different
+  // than the root of the one we're looking at.
+  if (auto cvt = remat.getDefiningOp<ConvertLayoutOp>()) {
+    if (cvt.getSrc() == root)
+      return {};
+  }
+  // This remat'ed value can be reused.
+  return remat;
+}
+
 // Remove unneeded values now that we are done with the rematMapping.
 void LayoutRematerialization::cleanup() {
   for (Operation *op : llvm::reverse(opToDelete))
@@ -766,8 +782,8 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
     auto layoutIt = layout.find(v);
     assert(layoutIt != layout.end());
     // If we already have a remat value for this value, use it.
-    if (hasRematValue(v, layoutIt->second)) {
-      mapping.map(v, getRematValue(v, layoutIt->second));
+    if (Value remat = getRematValue(v, layoutIt->second, convertOp.getSrc())) {
+      mapping.map(v, remat);
       valuesWithExistingRemat.insert(v);
       continue;
     }
@@ -928,12 +944,17 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
   rewriteSlice(slice, layout, convertOp, mapping);
 }
 
-LogicalResult getRematerializableSlice(
+LogicalResult LayoutRematerialization::getRematerializableSlice(
     Value root, Attribute rootEncoding, SetVector<Value> &slice,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr) {
-  LogicalResult result = getConvertBackwardSlice(root, slice, rootEncoding,
-                                                 layout, stopPropagation);
+    std::function<bool(Operation *)> stopPropagation) {
+  // Allow re-using existing conversions for a value.
+  auto getExistingConversion = [&](Value value, Attribute encoding) -> Value {
+    return getRematValue(value, encoding, root);
+  };
+  LogicalResult result =
+      getConvertBackwardSlice(root, slice, rootEncoding, layout,
+                              stopPropagation, getExistingConversion);
   if (result.failed() || slice.empty())
     return failure();
 
@@ -950,8 +971,14 @@ LogicalResult getRematerializableSlice(
 void LayoutRematerialization::backwardRematerialization() {
   // Go through each ConvertLayoutOp.
   SmallVector<ConvertLayoutOp> convertOps;
-  funcOp.walk(
-      [&](ConvertLayoutOp convertOp) { convertOps.push_back(convertOp); });
+  funcOp.walk([&](ConvertLayoutOp convertOp) {
+    convertOps.push_back(convertOp);
+    // Add existing layout conversions as rematerializations of themselves. This
+    // enables rematerialization of other conversions to re-use existing
+    // conversions. Importantly, don't add them to `mappedValues`.
+    rematMapping.insert(
+        {{convertOp.getSrc(), convertOp.getType().getEncoding()}, convertOp});
+  });
   for (ConvertLayoutOp convertOp : convertOps) {
     backwardRematerialization(convertOp);
   }
@@ -976,14 +1003,13 @@ void LayoutRematerialization::backwardRematerialization(
   // careful with the heuristics for both correctness and perf
   if (isa<DotOperandEncodingAttr, LinearEncodingAttr>(targetType.getEncoding()))
     return;
-  Value oldV = convertOp->getOperand(0);
+  Value oldV = convertOp.getSrc();
   LDBG("check backward remat with source " << oldV << " encoding "
                                            << targetType.getEncoding());
   // Check to see if there are existing remat'ed values for the pair of oldValue
   // and encoding.
-  if (hasRematValue(oldV, targetType.getEncoding())) {
+  if (Value newV = getRematValue(oldV, targetType.getEncoding(), oldV)) {
     // Replace it with the remat'ed value.
-    Value newV = getRematValue(oldV, targetType.getEncoding());
     convertOp.replaceAllUsesWith(newV);
     opToDelete.insert(convertOp);
     LDBG("found remat'ed value" << newV);
```

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 11 additions & 5 deletions

```diff
@@ -757,11 +757,11 @@ static bool isFreeConvert(Operation *op) {
                              convertOp.getType());
 }
 
-LogicalResult
-getConvertBackwardSlice(Value root, SetVector<Value> &slice,
-                        Attribute rootEncoding,
-                        DenseMap<Value, Attribute> &layout,
-                        std::function<bool(Operation *)> stopPropagation) {
+LogicalResult getConvertBackwardSlice(
+    Value root, SetVector<Value> &slice, Attribute rootEncoding,
+    DenseMap<Value, Attribute> &layout,
+    std::function<bool(Operation *)> stopPropagation,
+    std::function<Value(Value, Attribute)> getExistingConversion) {
   DenseSet<std::pair<Value, Attribute>> seen;
   SmallVector<std::pair<Value, Attribute>> queue;
 
@@ -802,6 +802,12 @@ getConvertBackwardSlice(Value root, SetVector<Value> &slice,
 
       continue;
     }
+    Value existing;
+    if (getExistingConversion &&
+        (existing = getExistingConversion(currentValue, encoding))) {
+      enqueue(existing, encoding);
+      continue;
+    }
     if (auto *definingOp = currentValue.getDefiningOp()) {
       // If the op has multiple results we need to update all results layout.
       for (Value result : definingOp->getResults()) {
```
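The added block is the crux of the change: when the caller-provided callback finds an existing conversion, the walk enqueues that value under the same encoding and moves on, so the slice terminates at the reusable conversion instead of rematerializing `currentValue`'s defining op.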

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -14,6 +14,7 @@ configure_lit_site_cfg(
 set(TRITON_TEST_DEPENDS
   triton-opt
   triton-tensor-layout
+  triton-llvm-opt
 )
 
 set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck")
```

test/TritonGPU/combine.mlir

Lines changed: 27 additions & 2 deletions

```diff
@@ -2427,8 +2427,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     %2 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<64x64x!tt.ptr<f16>, #blocked2>
     %3 = tt.splat %arg1 : !tt.ptr<i8> -> tensor<128x64x!tt.ptr<i8>, #blocked>
     %4 = tt.splat %arg1 : !tt.ptr<i8> -> tensor<128x64x!tt.ptr<i8>, #blocked>
-    // CHECK: %[[F:.+]]:3 = scf.for {{.*}} -> (tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>)
-    // CHECK: scf.yield {{.*}} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>
+    // CHECK: %[[F:.+]]:3 = scf.for {{.*}} -> (tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>)
+    // CHECK-COUNT-4: convert_layout
+    // CHECK: scf.yield {{.*}} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
     // CHECK: }
     // CHECK: tt.return %[[F]]#0, %[[F]]#1 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
     %5:3 = scf.for %arg2 = %c0_i32 to %c2048_i32 step %c64_i32 iter_args(%arg3 = %cst_2, %arg4 = %cst, %arg5 = %cst_0) -> (tensor<128x64xf32, #mma>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) : i32 {
@@ -2728,3 +2729,27 @@ tt.func @propagate_layout_gather(%arg0: tensor<1024x256xi32, #blocked>, %arg1: t
 }
 
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [2, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+
+// CHECK-LABEL: reuse_layout_conversion
+tt.func @reuse_layout_conversion(%arg0: tensor<64x64xf32, #blocked>) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) {
+  // CHECK-NEXT: %cst = arith.constant {{.*}} tensor<64x64xf32, #blocked>
+  %cst = arith.constant dense<2.000000e+00> : tensor<64x64xf32, #blocked1>
+  // CHECK-NEXT: [[TRANS:%.*]] = tt.trans %arg0 {{.*}} tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1>
+  %0 = tt.trans %arg0 {order = array<i32: 1, 0>} : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1>
+  // CHECK-NEXT: [[CVT:%.*]] = ttg.convert_layout [[TRANS]] : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked>
+  %1 = ttg.convert_layout %0 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked>
+  // CHECK-NEXT: [[RESULT:%.*]] = arith.mulf [[CVT]], %cst : tensor<64x64xf32, #blocked>
+  %2 = arith.mulf %0, %cst : tensor<64x64xf32, #blocked1>
+  %3 = ttg.convert_layout %2 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked>
+  // CHECK-NEXT: return [[CVT]], [[RESULT]]
+  tt.return %1, %3 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>
+}
+
+}
```
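Previously, `%3` could not be eliminated because rematerializing `%2` in `#blocked` appeared to require a fresh conversion of `%0`. The CHECK lines verify that the pass now reuses the existing conversion (`[[CVT]]`), recomputes the `mulf` directly in `#blocked`, and emits no second `ttg.convert_layout` for it.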
