
Commit f81f19a

[release/3.4] "[BACKEND] support tt::TransOp in comesFromLoadOrBlockArg (triton-lang#7343)" (triton-lang#7346)
This patches a bug (pytorch/pytorch#156028) that was introduced by triton-lang#7066. That PR refactored comesFromLoadOrBlockArg (used by PromoteLHSToTMem.cpp) with the intent of extending it to support all MemDescViewTrait ops; in the refactor, however, support for tt::TransOp was dropped, which changed the behavior of AccelerateMatmul.cpp. This PR adds tt::TransOp back into the set of ops supported by comesFromLoadOrBlockArg. In other words:

* behavior before triton-lang#7066: comesFromLoadOrBlockArg tracks loads past ttg::ConvertLayoutOp and **tt::TransOp**
* behavior after triton-lang#7066: comesFromLoadOrBlockArg tracks loads past ttg::ConvertLayoutOp, ttg::MemDescSubview, ttg::MemDescTransOp, ttg::MemDescReshapeOp, and ttg::MemDescReinterpretOp
* behavior after this PR: comesFromLoadOrBlockArg tracks loads past ttg::ConvertLayoutOp, **tt::TransOp**, ttg::MemDescSubview, ttg::MemDescTransOp, ttg::MemDescReshapeOp, and ttg::MemDescReinterpretOp
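For readers skimming the diff, a minimal sketch of the traversal with the tt::TransOp case restored is included below. Only the three if-blocks mirror code that actually appears in this commit's hunk in lib/Dialect/TritonGPU/Transforms/Utility.cpp; the loop shape, the tt/ttg namespace aliases, and the final load/block-argument check (tt::LoadOp, tt::DescriptorLoadOp) are illustrative assumptions, not the exact upstream code.

```cpp
// Sketch only -- simplified view of comesFromLoadOrBlockArg.
// Assumed context (as in the TritonGPU transform sources):
//   using namespace mlir;
//   namespace tt = mlir::triton; namespace ttg = mlir::triton::gpu;
bool comesFromLoadOrBlockArg(Value v) {
  // Walk backwards through ops that only re-view or re-lay-out the data.
  while (Operation *def = v.getDefiningOp()) {
    if (auto cvtOp = dyn_cast<ttg::ConvertLayoutOp>(def)) {
      v = cvtOp.getSrc();     // look through layout conversions
      continue;
    }
    if (auto transOp = dyn_cast<tt::TransOp>(def)) {
      v = transOp.getSrc();   // restored by this PR: look through tt.trans
      continue;
    }
    if (def->hasTrait<OpTrait::MemDescViewTrait>()) {
      v = def->getOperand(0); // memdesc view ops, handled since triton-lang#7066
      continue;
    }
    break;                    // any other defining op ends the walk
  }
  // Assumed final check (outside this diff): a block argument has no defining
  // op; otherwise the value must come from a load-like op.
  Operation *def = v.getDefiningOp();
  return !def || isa<tt::LoadOp, tt::DescriptorLoadOp>(def);
}
```

With that case back, AccelerateMatmul.cpp again classifies a dot operand produced by tt.trans of a loaded tensor as load-fed, which is exactly what the new lit test below exercises.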
1 parent a9ccbac commit f81f19a

2 files changed: +37 -0 lines changed

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 4 additions & 0 deletions
@@ -1568,6 +1568,10 @@ bool comesFromLoadOrBlockArg(Value v) {
       v = cvtOp.getSrc();
       continue;
     }
+    if (auto transOp = dyn_cast<tt::TransOp>(def)) {
+      v = transOp.getSrc();
+      continue;
+    }
     if (def->hasTrait<OpTrait::MemDescViewTrait>()) {
       v = def->getOperand(0);
       continue;

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 33 additions & 0 deletions
@@ -566,3 +566,36 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 4], order = [1, 0]}>
+#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: identify_load_then_trans
+  tt.func public @identify_load_then_trans(
+      %arg0: !tt.tensordesc<tensor<128x128xf16>>,
+      %arg1: !tt.tensordesc<tensor<128x128xf16>>,
+      %arg2: i32,
+      %arg3: i32,
+      %arg4: i32,
+      %arg5: tensor<128x128xf32, #blocked>
+  ) -> tensor<128x128xf32, #blocked> {
+    // CHECK: %[[DESC0:.*]] = tt.descriptor_load %arg0
+    // CHECK: %[[DESC1:.*]] = tt.descriptor_load %arg1
+    %13 = tt.descriptor_load %arg0[%arg4, %arg2] : !tt.tensordesc<tensor<128x128xf16>> -> tensor<128x128xf16, #blocked2>
+    %14 = tt.descriptor_load %arg1[%arg3, %arg4] : !tt.tensordesc<tensor<128x128xf16>> -> tensor<128x128xf16, #blocked2>
+    // CHECK: %[[TRANS0:.*]] = tt.trans %[[DESC0]]
+    // CHECK: %[[ALLOC0:.*]] = ttg.local_alloc %[[TRANS0]]
+    %15 = tt.trans %13 {order = array<i32: 1, 0>} : tensor<128x128xf16, #blocked2> -> tensor<128x128xf16, #blocked3>
+    // CHECK: %[[TRANS1:.*]] = tt.trans %[[DESC1]]
+    // CHECK: %[[ALLOC1:.*]] = ttg.local_alloc %[[TRANS1]]
+    %16 = tt.trans %14 {order = array<i32: 1, 0>} : tensor<128x128xf16, #blocked2> -> tensor<128x128xf16, #blocked3>
+    %17 = ttg.convert_layout %15 : tensor<128x128xf16, #blocked3> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %18 = ttg.convert_layout %16 : tensor<128x128xf16, #blocked3> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
+    // CHECK: ttng.warp_group_dot %[[ALLOC0]], %[[ALLOC1]]
+    %19 = tt.dot %17, %18, %arg5, inputPrecision = tf32 : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x128xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>
+    tt.return %19 : tensor<128x128xf32, #blocked>
+  }
+}
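A short note on the new test, based only on the hunk above: the leading // ----- line opens a separate split-input case, so the function is processed on its own by the file's existing RUN line. The CHECK directives require that, with tt::TransOp handled again, the accelerate-matmul pass treats the tt.trans of each tt.descriptor_load result as coming from a load, materializes both dot operands with ttg.local_alloc, and lowers tt.dot to ttng.warp_group_dot on those allocations.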
