Skip to content

Commit 6e390f3

Browse files
authored
[BACKEND] Restrict pattern re-ordering of alloc and reshape (#8266)
This avoids an extra round trip through registers when the user needs an MMA layout.
1 parent d5d1c6d commit 6e390f3

File tree

2 files changed

+23
-0
lines changed

2 files changed

+23
-0
lines changed

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,12 @@ class ReshapeMemDesc : public OpRewritePattern<LocalAllocOp> {
165165
getContext(), allocOp.getLoc(), allocType, srcShape, innerTy)))
166166
return failure();
167167

168+
// For now don't apply the transformation if the new encoding is not an
169+
// MMAv3/v5 encoding as it may not be compatible with the user.
170+
// The heuristic can be refined once we have more flexible mma ops.
171+
if (!isa<NVMMASharedEncodingAttr>(innerTy.getEncoding()))
172+
return failure();
173+
168174
auto newAlloc = rewriter.create<LocalAllocOp>(allocOp.getLoc(), innerTy,
169175
reshapeOp.getSrc());
170176
rewriter.replaceOpWithNewOp<MemDescReshapeOp>(allocOp, allocOp.getType(),

test/TritonGPU/dot-operands.mlir

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -276,3 +276,20 @@ module attributes {"ttg.target" = "cuda:100", "ttg.num-ctas" = 1 : i32, "ttg.num
276276
tt.return %a: !ttg.memdesc<128x64xf16, #shared, #smem>
277277
}
278278
}
279+
280+
// -----
281+
282+
#blocked = #ttg.blocked<{sizePerThread = [1, 1, 1, 1], threadsPerWarp = [1, 1, 1, 32], warpsPerCTA = [1, 2, 2, 1], order = [3, 2, 1, 0]}>
283+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
284+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 32}>
285+
#smem = #ttg.shared_memory
286+
module attributes {"ttg.target" = "cuda:100", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
287+
// CHECK-LABEL: @reshape_memdesc_negative
288+
tt.func @reshape_memdesc_negative(%arg: tensor<1x32x2x64xf32, #blocked>) -> !ttg.memdesc<64x64xf32, #shared, #smem> {
289+
%r = tt.reshape %arg : tensor<1x32x2x64xf32, #blocked> -> tensor<64x64xf32, #blocked1>
290+
// CHECK-NOT: ttg.memdesc_reshape
291+
%a = ttg.local_alloc %r : (tensor<64x64xf32, #blocked1>) -> !ttg.memdesc<64x64xf32, #shared, #smem>
292+
// CHECK: tt.return
293+
tt.return %a: !ttg.memdesc<64x64xf32, #shared, #smem>
294+
}
295+
}

0 commit comments

Comments (0)