Commit febe2a1
[ScaledDot] Remove SinkTranspose from ScaledDot (triton-lang#5653)
We remove the SinkTranspose transformation. It was initially put in place to circumvent the issue of not being able to propagate the MMA layout past a transpose. Support for that propagation landed in triton-lang#5403, so this pass is no longer necessary. The next step will be to get rid of the `transposeDot` part of the pass and instead integrate it into a different, more generic pass that checks whether a dot op's inputs should be transposed to take advantage of the reg x shmem MMAv3 op.
1 parent 41ecd1c commit febe2a1
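Editor's note: for illustration, a minimal sketch of the rewrite that the remaining `transposeDot` step performs. The IR below is schematic, not verbatim compiler output: value names are invented, most types are elided as `...`, and the exact `tt.dot_scaled` assembly for the lhs-scale form is an assumption modeled on the rhs-scale form in the tests of this diff.

// Before: the scale sits on the rhs operand (shapes/layouts elided).
%d = tt.dot_scaled %a, %b scale %s, %acc lhs = e4m3 rhs = e2m1 {fastMath = false} : ...

// After: using (A x B)^T = B^T x A^T, the operands are transposed and
// swapped so the scaled operand becomes the lhs, and a trailing transpose
// restores the original orientation of the result.
%bT   = tt.trans %b   {order = array<i32: 1, 0>} : ...
%aT   = tt.trans %a   {order = array<i32: 1, 0>} : ...
%accT = tt.trans %acc {order = array<i32: 1, 0>} : ...
%dT   = tt.dot_scaled %bT scale %s, %aT, %accT lhs = e2m1 rhs = e4m3 {fastMath = false} : ...
%d    = tt.trans %dT {order = array<i32: 1, 0>} : ...

Previously, `sinkTransposeOp` pushed that trailing `tt.trans` past elementwise ops, `ttg.convert_layout`, and `scf.for` yields by hand; with triton-lang#5403 the generic layout propagation removes the resulting redundant converts instead, as the new combine.mlir test below checks.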

File tree

3 files changed: +49, -146 lines
lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 2 additions & 111 deletions
@@ -656,110 +656,8 @@ class DecomposeScaledBlocked
   }
 };
 
-static void updateValueType(Value v, Attribute encoding,
-                            ArrayRef<int64_t> shape) {
-  auto tensorType = cast<RankedTensorType>(v.getType());
-  auto newType =
-      RankedTensorType::get(shape, tensorType.getElementType(), encoding);
-  v.setType(newType);
-}
-
-static TransOp updateUsers(Value result, const SetVector<Operation *> &slice) {
-  TransOp transOp;
-  if (llvm::any_of(result.getUsers(),
-                   [&](Operation *user) { return slice.count(user) == 0; })) {
-    OpBuilder builder(result.getContext());
-    builder.setInsertionPointAfterValue(result);
-    transOp =
-        builder.create<TransOp>(result.getLoc(), result, ArrayRef({1, 0}));
-    result.replaceUsesWithIf(transOp.getResult(), [&](OpOperand &operand) {
-      return operand.getOwner() != transOp.getOperation() &&
-             slice.count(operand.getOwner()) == 0;
-    });
-  }
-  return transOp;
-}
-
-// Sync the transpose in the IR, this is done to avoid generating convert layout
-// when we have a transpose right after a dot as mma layout cannot be propagated
-// through transpose op. Once we have layouts that can represent transposed MMA
-// we can remove this transformation.
-static void sinkTransposeOp(TransOp input) {
-  SmallVector<TransOp> queue = {input};
-  while (!queue.empty()) {
-    TransOp transOp = queue.back();
-    Value currentValue = transOp.getResult();
-    queue.pop_back();
-    mlir::ForwardSliceOptions options;
-    options.filter = [](Operation *op) {
-      if (op->hasTrait<OpTrait::Elementwise>() && op->getNumOperands() == 1)
-        return true;
-      if (isa<scf::YieldOp>(op))
-        return isa<scf::ForOp>(op->getParentOp());
-      if (isa<ConvertLayoutOp>(op))
-        return true;
-      return false;
-    };
-    SetVector<Operation *> slice;
-    mlir::getForwardSlice(currentValue, &slice, options);
-    for (Operation *op : slice) {
-      if (op->hasTrait<OpTrait::Elementwise>()) {
-        // Update users of transpose op.
-        if (op->getOperand(0) == transOp.getResult())
-          op->setOperand(0, transOp.getOperand());
-        // Update the type of the result.
-        for (Value result : op->getResults()) {
-          auto srcType = cast<RankedTensorType>(op->getOperand(0).getType());
-          updateValueType(result, srcType.getEncoding(), srcType.getShape());
-          updateUsers(result, slice);
-        }
-        continue;
-      }
-      if (auto cvtOp = dyn_cast<ConvertLayoutOp>(op)) {
-        // Update users of transpose op.
-        if (op->getOperand(0) == transOp.getResult())
-          op->setOperand(0, transOp.getOperand());
-        auto resultEncoding = cvtOp.getType().getEncoding();
-        auto newDstEncoding = inferSrcEncoding(transOp, resultEncoding);
-        assert(newDstEncoding);
-        auto srcType = cast<RankedTensorType>(cvtOp.getOperand().getType());
-        updateValueType(cvtOp.getResult(), newDstEncoding, srcType.getShape());
-        updateUsers(cvtOp.getResult(), slice);
-        continue;
-      }
-      assert(isa<scf::YieldOp>(op));
-      auto forOp = dyn_cast<scf::ForOp>(op->getParentOp());
-      assert(forOp);
-      for (OpOperand &operand : op->getOpOperands()) {
-        Operation *def = operand.get().getDefiningOp();
-        if (def && (slice.count(def)) || def == transOp.getOperation()) {
-          if (def == transOp.getOperation())
-            operand.set(transOp.getOperand());
-          Type newType = operand.get().getType();
-          forOp.getResult(operand.getOperandNumber()).setType(newType);
-          TransOp retTrans =
-              updateUsers(forOp.getResult(operand.getOperandNumber()), slice);
-          // Recursively try to propagate the new transpose inserted.
-          if (retTrans)
-            queue.push_back(retTrans);
-          forOp.getRegionIterArg(operand.getOperandNumber()).setType(newType);
-          TransOp argTrans = updateUsers(
-              forOp.getRegionIterArg(operand.getOperandNumber()), slice);
-          if (argTrans)
-            queue.push_back(argTrans);
-          OpBuilder builder(forOp);
-          OpOperand &init = forOp.getInitsMutable()[operand.getOperandNumber()];
-          Value initTranspose = builder.create<TransOp>(
-              forOp.getLoc(), init.get(), ArrayRef({1, 0}));
-          init.set(initTranspose);
-        }
-      }
-    }
-  }
-}
-
 // Transpose scaled_dot ops that have a scale on lhs.
-static Operation *transposeDotOp(DotScaledOp dotOp) {
+static void transposeDotOp(DotScaledOp dotOp) {
   OpBuilder builder(dotOp);
   Value lhs = dotOp.getLhs();
   std::array<int, 2> transOrder = {1, 0};

@@ -776,7 +674,6 @@ static Operation *transposeDotOp(DotScaledOp dotOp) {
       builder.create<TransOp>(result.getLoc(), result, transOrder);
   dotOp.replaceAllUsesWith(transposedResult);
   dotOp.erase();
-  return transposedResult;
 }
 
 static void transposeDots(ModuleOp m) {

@@ -787,14 +684,8 @@ static void transposeDots(ModuleOp m) {
     if (dotOp.getLhsScale() == nullptr && dotOp.getRhsScale() != nullptr)
       toTranspose.push_back(dotOp);
   });
-  SmallVector<Operation *> transposes;
   for (DotScaledOp dotOp : toTranspose) {
-    Operation *transpose = transposeDotOp(dotOp);
-    transposes.push_back(transpose);
-  }
-
-  for (Operation *transpose : transposes) {
-    sinkTransposeOp(cast<TransOp>(transpose));
+    transposeDotOp(dotOp);
   }
 }
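Editor's note: taken together, the three hunks above leave a much smaller driver. For reference, here is `transposeDots` as it can be reconstructed from the context and added lines of this diff; the first two statements of the body fall outside the shown hunks and are inferred, so treat this as a sketch rather than the verbatim post-commit source.

static void transposeDots(ModuleOp m) {
  // Collect scaled dots whose scale is on the rhs only (the walk itself is
  // not shown in the hunks above and is inferred from their context).
  SmallVector<DotScaledOp> toTranspose;
  m.walk([&](DotScaledOp dotOp) {
    if (dotOp.getLhsScale() == nullptr && dotOp.getRhsScale() != nullptr)
      toTranspose.push_back(dotOp);
  });
  // Rewrite each dot in place; there is no longer a transpose to sink.
  for (DotScaledOp dotOp : toTranspose) {
    transposeDotOp(dotOp);
  }
}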

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 0 additions & 35 deletions
@@ -226,38 +226,3 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
     tt.return %result : tensor<128x128xf32, #blocked>
   }
 }
-
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
-#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
-#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
-#blocked3 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
-  // CHECK-LABEL: dot_scale_transpose
-  tt.func public @dot_scale_transpose(%arg0: tensor<128x64xf8E4M3FN, #blocked>, %arg1: tensor<32x32xi8, #blocked1>, %arg2: tensor<32x2xi8, #blocked2>, %arg3: tensor<128x32x!tt.ptr<bf16>, #blocked3>) {
-    %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked1>
-    %c1_i32 = arith.constant 1 : i32
-    %c100_i32 = arith.constant 100 : i32
-    %c0_i32 = arith.constant 0 : i32
-    %cst_0 = arith.constant dense<32> : tensor<32x1xi32, #blocked3>
-    %cst_1 = arith.constant dense<2> : tensor<32x1xi32, #blocked2>
-    // CHECK: scf.for
-    %0 = scf.for %arg4 = %c0_i32 to %c100_i32 step %c1_i32 iter_args(%arg5 = %cst) -> (tensor<128x32xf32, #blocked1>) : i32 {
-      // CHECK-DAG: tt.trans %{{.*}} {order = array<i32: 1, 0>} : tensor<128x64xf8E4M3FN, #{{.*}}> -> tensor<64x128xf8E4M3FN, #{{.*}}>
-      // CHECK-DAG: tt.trans %a{{.*}} {order = array<i32: 1, 0>} : tensor<32x32xi8, #{{.*}}> -> tensor<32x32xi8, #{{.*}}>
-      %3 = tt.dot_scaled %arg0, %arg1 scale %arg2, %arg5 lhs = e4m3 rhs = e2m1 {fastMath = false}: tensor<128x64xf8E4M3FN, #blocked> * tensor<32x32xi8, #blocked1>, tensor<32x2xi8, #blocked2> -> tensor<128x32xf32, #blocked1>
-      // CHECK: tt.dot
-      // CHECK-NOT: tt.trans
-      // CHECK: scf.yield
-      scf.yield %3 : tensor<128x32xf32, #blocked1>
-    }
-    // CHECK: arith.truncf
-    // CHECK: ttg.convert_layout
-    // CHECK: tt.trans
-    %1 = arith.truncf %0 : tensor<128x32xf32, #blocked1> to tensor<128x32xbf16, #blocked1>
-    %2 = ttg.convert_layout %1 : tensor<128x32xbf16, #blocked1> -> tensor<128x32xbf16, #blocked3>
-    tt.store %arg3, %2 : tensor<128x32x!tt.ptr<bf16>, #blocked3>
-    tt.return
-  }
-}

test/TritonGPU/combine.mlir

Lines changed: 47 additions & 0 deletions
@@ -2848,3 +2848,50 @@ tt.func @reduce_linear_layouts(%arg0: tensor<32x32xi32, #linear>) -> tensor<32xi
 }
 
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked3 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
+#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
+#linear = #ttg.linear<{register = [[16, 0]], lane = [[0, 1], [1, 0], [2, 0], [4, 0], [8, 0]], warp = [[0, 0], [0, 0]], block = []}>
+#mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+
+  // Test that after dot_scaled with rhs scales is decomposed, we are able to get rid of the redundant convert_layout
+  // CHECK-LABEL: dot_scale_transpose
+  tt.func public @dot_scale_transpose(%arg0: tensor<128x64xf8E4M3FN, #blocked>, %arg1: tensor<32x32xi8, #blocked1>, %arg2: tensor<32x2xi8, #blocked2>, %arg3: tensor<128x32x!tt.ptr<bf16>, #blocked3>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked1>
+    %c1_i32 = arith.constant 1 : i32
+    %c100_i32 = arith.constant 100 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %0 = scf.for %arg4 = %c0_i32 to %c100_i32 step %c1_i32 iter_args(%arg5 = %cst) -> (tensor<128x32xf32, #blocked1>) : i32 {
+      %3 = tt.trans %arg0 {order = array<i32: 1, 0>} : tensor<128x64xf8E4M3FN, #blocked> -> tensor<64x128xf8E4M3FN, #blocked4>
+      %4 = tt.trans %arg1 {order = array<i32: 1, 0>} : tensor<32x32xi8, #blocked1> -> tensor<32x32xi8, #blocked5>
+      %5 = tt.trans %arg5 {order = array<i32: 1, 0>} : tensor<128x32xf32, #blocked1> -> tensor<32x128xf32, #blocked5>
+      %6 = ttg.convert_layout %5 : tensor<32x128xf32, #blocked5> -> tensor<32x128xf32, #mma>
+      %7 = ttg.convert_layout %4 : tensor<32x32xi8, #blocked5> -> tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
+      %8 = ttg.convert_layout %arg2 : tensor<32x2xi8, #blocked2> -> tensor<32x2xi8, #linear>
+      %9 = ttg.upcast_mxfp %7, %8 fp_type = e2m1 {fastMath = false} : tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, tensor<32x2xi8, #linear> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+      %10 = ttg.convert_layout %3 : tensor<64x128xf8E4M3FN, #blocked4> -> tensor<64x128xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+      %11 = tt.fp_to_fp %10 : tensor<64x128xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<64x128xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+      %12 = tt.dot %9, %11, %6 : tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<64x128xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<32x128xf32, #mma>
+      // CHECK: tt.dot
+      // CHECK-NOT: ttg.convert_layout
+      // CHECK: scf.yield
+      %13 = ttg.convert_layout %12 : tensor<32x128xf32, #mma> -> tensor<32x128xf32, #blocked5>
+      %14 = tt.trans %13 {order = array<i32: 1, 0>} : tensor<32x128xf32, #blocked5> -> tensor<128x32xf32, #blocked1>
+      scf.yield %14 : tensor<128x32xf32, #blocked1>
+    }
+    // CHECK: arith.truncf
+    // CHECK-NEXT: ttg.convert_layout
+    // CHECK-NEXT: tt.store
+    %1 = arith.truncf %0 : tensor<128x32xf32, #blocked1> to tensor<128x32xbf16, #blocked1>
+    %2 = ttg.convert_layout %1 : tensor<128x32xbf16, #blocked1> -> tensor<128x32xbf16, #blocked3>
+    tt.store %arg3, %2 : tensor<128x32x!tt.ptr<bf16>, #blocked3>
+    tt.return
+  }
+}
