
Commit de7301e

[BACKEND] support tt::TransOp in comesFromLoadOrBlockArg (#7343)
This patches a bug (pytorch/pytorch#156028) introduced by #7066. #7066 refactored comesFromLoadOrBlockArg so it could be reused in PromoteLHSToTMem.cpp, with the intent of extending it to support all MemDescViewTrait ops. However, the refactor dropped support for tt::TransOp, changing the behavior of AccelerateMatmul.cpp. This PR adds tt::TransOp back into the set of ops handled by comesFromLoadOrBlockArg; a sketch of the resulting traversal follows this list.

* behavior before #7066: comesFromLoadOrBlockArg tracks loads past ttg::ConvertLayoutOp and **tt::TransOp**
* behavior after #7066: comesFromLoadOrBlockArg tracks loads past ttg::ConvertLayoutOp, ttg::MemDescSubview, ttg::MemDescTransOp, ttg::MemDescReshapeOp, and ttg::MemDescReinterpretOp
* behavior after this PR: comesFromLoadOrBlockArg tracks loads past ttg::ConvertLayoutOp, **tt::TransOp**, ttg::MemDescSubview, ttg::MemDescTransOp, ttg::MemDescReshapeOp, and ttg::MemDescReinterpretOp
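For reference, a minimal sketch of the traversal with the tt::TransOp case restored. It assumes the loop structure visible in the Utility.cpp hunk below and the usual tt/ttg namespace aliases; the final load/block-argument check is paraphrased for illustration, not copied from the source.

bool comesFromLoadOrBlockArg(Value v) {
  // Walk up the def chain, looking through ops that merely re-view the data.
  while (Operation *def = v.getDefiningOp()) {
    if (auto cvtOp = dyn_cast<ttg::ConvertLayoutOp>(def)) {
      v = cvtOp.getSrc();      // look through layout conversions
      continue;
    }
    if (auto transOp = dyn_cast<tt::TransOp>(def)) {
      v = transOp.getSrc();    // case restored by this PR
      continue;
    }
    if (def->hasTrait<OpTrait::MemDescViewTrait>()) {
      v = def->getOperand(0);  // memdesc subview/trans/reshape/reinterpret
      continue;
    }
    break;
  }
  // Paraphrased final check: the walk succeeds if it bottoms out at a block
  // argument or a load-like op (e.g. tt.load / tt.descriptor_load).
  Operation *def = v.getDefiningOp();
  return !def || isa<tt::LoadOp, tt::DescriptorLoadOp>(def);
}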
Parent commit: 21d2ef2

2 files changed: +37 -0 lines

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 4 additions & 0 deletions
@@ -1573,6 +1573,10 @@ bool comesFromLoadOrBlockArg(Value v) {
       v = cvtOp.getSrc();
       continue;
     }
+    if (auto transOp = dyn_cast<tt::TransOp>(def)) {
+      v = transOp.getSrc();
+      continue;
+    }
     if (def->hasTrait<OpTrait::MemDescViewTrait>()) {
       v = def->getOperand(0);
       continue;

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 33 additions & 0 deletions
@@ -566,3 +566,36 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 4], order = [1, 0]}>
+#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: identify_load_then_trans
+  tt.func public @identify_load_then_trans(
+    %arg0: !tt.tensordesc<tensor<128x128xf16>>,
+    %arg1: !tt.tensordesc<tensor<128x128xf16>>,
+    %arg2: i32,
+    %arg3: i32,
+    %arg4: i32,
+    %arg5: tensor<128x128xf32, #blocked>
+  ) -> tensor<128x128xf32, #blocked> {
+    // CHECK: %[[DESC0:.*]] = tt.descriptor_load %arg0
+    // CHECK: %[[DESC1:.*]] = tt.descriptor_load %arg1
+    %13 = tt.descriptor_load %arg0[%arg4, %arg2] : !tt.tensordesc<tensor<128x128xf16>> -> tensor<128x128xf16, #blocked2>
+    %14 = tt.descriptor_load %arg1[%arg3, %arg4] : !tt.tensordesc<tensor<128x128xf16>> -> tensor<128x128xf16, #blocked2>
+    // CHECK: %[[TRANS0:.*]] = tt.trans %[[DESC0]]
+    // CHECK: %[[ALLOC0:.*]] = ttg.local_alloc %[[TRANS0]]
+    %15 = tt.trans %13 {order = array<i32: 1, 0>} : tensor<128x128xf16, #blocked2> -> tensor<128x128xf16, #blocked3>
+    // CHECK: %[[TRANS1:.*]] = tt.trans %[[DESC1]]
+    // CHECK: %[[ALLOC1:.*]] = ttg.local_alloc %[[TRANS1]]
+    %16 = tt.trans %14 {order = array<i32: 1, 0>} : tensor<128x128xf16, #blocked2> -> tensor<128x128xf16, #blocked3>
+    %17 = ttg.convert_layout %15 : tensor<128x128xf16, #blocked3> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %18 = ttg.convert_layout %16 : tensor<128x128xf16, #blocked3> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
+    // CHECK: ttng.warp_group_dot %[[ALLOC0]], %[[ALLOC1]]
+    %19 = tt.dot %17, %18, %arg5, inputPrecision = tf32 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>
+    tt.return %19 : tensor<128x128xf32, #blocked>
+  }
+}
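For illustration only, a hypothetical caller-side sketch: the helper name comesFromLoadOrBlockArg is real, but every other name below is invented for this example, and the ttg::LocalAllocOp builder signature is assumed. It shows how a lowering pass might gate shared-memory promotion of a dot operand on this helper, which is what the test above exercises now that tt.trans no longer breaks the walk back to tt.descriptor_load.

// Hypothetical sketch, not the actual AccelerateMatmul.cpp code.
static Value promoteDotOperandToSmem(OpBuilder &b, Location loc,
                                     Value operand, Type memDescTy) {
  // Only promote operands whose producer chain bottoms out at a load or a
  // block argument; with this PR that chain may include tt.trans.
  if (!comesFromLoadOrBlockArg(operand))
    return Value();
  // Builder call assumed for illustration: wrap the operand in a shared-memory
  // allocation, matching the CHECK lines above (ttg.local_alloc feeding
  // ttng.warp_group_dot).
  return b.create<ttg::LocalAllocOp>(loc, memDescTy, operand);
}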
