Commit cbd5d48

[AMD] Drop deprecated pattern in OptimizeDotOperands pass (#8716)
1 parent c3c65b9 commit cbd5d48

2 files changed: +1, -306 lines

test/TritonGPU/amd/amd-optimize-dot-operands.mlir

Lines changed: 1 addition & 126 deletions
@@ -1,129 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx950" | FileCheck %s --check-prefixes CHECK,GFX950
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx942" | FileCheck %s
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK{LITERAL}: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_local_load_transposed
-// CHECK: %[[LOAD:.+]] = tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK: %[[ALLOC:.+]] = ttg.local_alloc %[[LOAD]] : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>
-// CHECK: %[[LOCAL_LOAD_TRANS:.+]] = ttg.local_load %[[ALLOC]] : !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<64x16xf16, #linear>
-// CHECK: %[[LOCAL_LOAD_DIRECT:.+]] = ttg.local_load %[[ALLOC]] : !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-// CHECK: tt.dot {{.+}}, %[[LOCAL_LOAD_DIRECT]], {{.+}}: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x16xf32, #mma>
-// CHECK: %[[TRANS:.+]] = tt.trans %[[LOCAL_LOAD_TRANS]] {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}}, %[[TRANS]], {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_local_load_transposed(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<128x16x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-    %3 = tt.dot %cst_1, %2, %cst_0 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x16xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<128x16xf32, #mma1> -> tensor<128x16xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<128x16x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK-NOT: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK-NOT: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_not_local_load_transposed_kWidth_mismatch
-// CHECK: tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK-NOT: ttg.local_alloc
-// CHECK-NOT: ttg.local_load
-// CHECK-NOT: ttg.local_load
-// CHECK: tt.dot {{.+}}: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<128x16xf32, #mma>
-// CHECK: tt.trans {{.+}} {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_not_local_load_transposed_kWidth_mismatch(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<128x16x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>>
-    %3 = tt.dot %cst_1, %2, %cst_0 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<128x16xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<128x16xf32, #mma1> -> tensor<128x16xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<128x16x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK-NOT: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK-NOT: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_not_local_load_transposed_opIdx_mismatch
-// CHECK: tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK-NOT: ttg.local_alloc
-// CHECK-NOT: ttg.local_load
-// CHECK-NOT: ttg.local_load
-// CHECK: tt.dot {{.+}}: tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<64x64xf32, #mma>
-// CHECK: tt.trans {{.+}} {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_not_local_load_transposed_opIdx_mismatch(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<64x64x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>>
-    %3 = tt.dot %2, %cst_1, %cst_0 : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<64x64xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<64x64xf32, #mma1> -> tensor<64x64xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<64x64x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-
-// -----
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx950" | FileCheck %s --check-prefixes GFX950

 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [2, 1], order = [1, 0]}>
 #linear = #ttg.linear<{register = [[0, 0, 1], [0, 0, 2], [0, 0, 4], [1, 0, 0], [2, 0, 0], [0, 32, 0], [0, 64, 0]], lane = [[0, 1, 0], [0, 2, 0], [0, 4, 0], [0, 8, 0], [0, 0, 8], [0, 0, 16]], warp = [[0, 16, 0]], block = []}>

third_party/amd/lib/TritonAMDGPUTransforms/OptimizeDotOperands.cpp

Lines changed: 0 additions & 180 deletions
@@ -26,185 +26,6 @@ namespace mlir::triton::amdgpu {

 namespace {

-// Detect a pair of tt.dot ops that both use the same tt.load result, one
-// directly and one via tt.trans and use the same shared memory buffer in this
-// case. Given:
-//   load -> cvt -> .. -> dot1
-//        -> cvt -> .. -> trans -> cvt -> .. -> dot2
-// Rewrite to:
-//   load -> local_alloc -> local_load -> dot1
-//        -> local_load_transposed -> dot2
-class ReuseShmemForDirectAndTransposedUse : public OpRewritePattern<LoadOp> {
-public:
-  ReuseShmemForDirectAndTransposedUse(MLIRContext *context,
-                                      triton::AMD::ISAFamily isaFamily)
-      : OpRewritePattern(context), isaFamily(isaFamily) {}
-
-  LogicalResult matchAndRewrite(tt::LoadOp loadOp,
-                                PatternRewriter &rewriter) const override {
-    auto numUsers = llvm::range_size(loadOp->getUsers());
-    if (numUsers < 2) {
-      return rewriter.notifyMatchFailure(loadOp,
-                                         "load op must have at least 2 users");
-    }
-
-    auto srcTy = dyn_cast<RankedTensorType>(loadOp.getType());
-    if (!srcTy) {
-      return rewriter.notifyMatchFailure(loadOp, "src type must be a tensor");
-    }
-
-    LDBG("ReuseShmemForDirectAndTransposedUse for load Op: " << *loadOp);
-
-    tt::DotOpInterface directDot = nullptr;
-    tt::DotOpInterface transDot = nullptr;
-    ttg::ConvertLayoutOp cvtOp = nullptr;
-    unsigned directOpIdx = 0;
-    unsigned transOpIdx = 0;
-
-    auto followConvertLayoutChain =
-        [](mlir::Value &usedValue, mlir::Operation *op) -> mlir::Operation * {
-      while (isa<ttg::ConvertLayoutOp>(op)) {
-        // Ensure we have exactly one user
-        if (!(op->hasOneUse()))
-          return nullptr;
-        usedValue = op->getResult(0);
-        op = *(op->getUsers().begin());
-      }
-
-      return op;
-    };
-
-    mlir::Value usedValue;
-    for (mlir::Operation *user : loadOp->getUsers()) {
-      auto op = user;
-
-      op = followConvertLayoutChain(usedValue, op);
-
-      if (auto transOp = dyn_cast_or_null<tt::TransOp>(op)) {
-        LDBG("Found tranpose op: " << *transOp);
-        cvtOp = transOp.getSrc().getDefiningOp<ttg::ConvertLayoutOp>();
-        LDBG("Found parent cvt op of transpose: " << *cvtOp);
-        usedValue = transOp->getResult(0);
-        op =
-            followConvertLayoutChain(usedValue, *(transOp->getUsers().begin()));
-        if (auto dotOp = dyn_cast<tt::DotOpInterface>(op)) {
-          transDot = dotOp;
-          transOpIdx = (usedValue == dotOp.getA()) ? 0 : 1;
-        }
-      } else if (auto dotOp = dyn_cast_or_null<tt::DotOpInterface>(op)) {
-        directDot = dotOp;
-        directOpIdx = (usedValue == dotOp.getA()) ? 0 : 1;
-      }
-
-      if (directDot && transDot)
-        break;
-    }
-
-    if (!directDot)
-      return rewriter.notifyMatchFailure(loadOp,
-                                         "expected a direct tt.dot user");
-    if (!transDot)
-      return rewriter.notifyMatchFailure(
-          loadOp, "expected a tt.trans feeding a tt.dot user");
-    if (directOpIdx != transOpIdx) {
-      return rewriter.notifyMatchFailure(loadOp, [&](mlir::Diagnostic &d) {
-        d << "operand indices of direct and transposed tt.dot users must be "
-             "the same. Got indices: direct: "
-          << directOpIdx << " and transposed: " << transOpIdx;
-      });
-    }
-
-    LDBG("load is shared between transposed and non-transposed users");
-    LDBG("Non-transposed access tt.dot: " << *directDot);
-    LDBG("Transposed access tt.dot: " << *transDot);
-
-    unsigned opIdx = directOpIdx;
-
-    auto directOperandType =
-        cast<RankedTensorType>(directDot->getOperand(opIdx).getType());
-    auto transOperandType =
-        cast<RankedTensorType>(transDot->getOperand(opIdx).getType());
-    auto directDotEnc =
-        dyn_cast<ttg::DotOperandEncodingAttr>(directOperandType.getEncoding());
-    auto transDotEnc =
-        dyn_cast<ttg::DotOperandEncodingAttr>(transOperandType.getEncoding());
-
-    if (!directDotEnc || !transDotEnc) {
-      return rewriter.notifyMatchFailure(loadOp,
-                                         "wrong encodings for tt.dot users");
-    }
-
-    if (directDotEnc.getKWidth() != transDotEnc.getKWidth()) {
-      return rewriter.notifyMatchFailure(loadOp, [&](mlir::Diagnostic &d) {
-        d << "kWidths are mismatching. direct: " << directDotEnc.getKWidth()
-          << " and transposed: " << transDotEnc.getKWidth();
-      });
-    }
-
-    // We need to ensure that the parents of direct and transposed dot encodings
-    // are matching in order to get the same shared memory encoding. Note that
-    // they can have different instrShape(s) (mfma instructions) but still map
-    // to the same shared memory encoding.
-    auto directCTALayout = ttg::getCTALayout(directDotEnc);
-    auto transCTALayout = ttg::getCTALayout(transDotEnc);
-
-    if (directCTALayout != transCTALayout) {
-      return rewriter.notifyMatchFailure(
-          loadOp,
-          "CTA layouts of direct and transposed tt.dot users are mismatching");
-    }
-
-    auto ctx = getContext();
-    auto sharedOrder = ttg::getOrderForMemory(srcTy);
-    auto sharedEnc = ttg::SwizzledSharedEncodingAttr::get(
-        ctx, directDotEnc, directOperandType.getShape(), sharedOrder,
-        directCTALayout, directOperandType.getElementType(),
-        /*needTrans=*/false);
-
-    LDBG("Created shared encoding: " << sharedEnc);
-    rewriter.setInsertionPointAfter(loadOp);
-    auto sharedMemorySpace = ttg::SharedMemorySpaceAttr::get(ctx);
-    Location loc = loadOp.getLoc();
-    auto alloc = ttg::LocalAllocOp::create(
-        rewriter, loc,
-        ttg::MemDescType::get(srcTy.getShape(), srcTy.getElementType(),
-                              sharedEnc, sharedMemorySpace),
-        loadOp.getResult());
-    LDBG("Created local alloc op: " << *alloc);
-    auto localLoad =
-        ttg::LocalLoadOp::create(rewriter, loc, directOperandType, alloc);
-    LDBG("Created local load op:" << *localLoad);
-    rewriter.modifyOpInPlace(
-        directDot, [&]() { directDot->setOperand(opIdx, localLoad); });
-    LDBG("Updated Direct dot: " << *directDot);
-    if (!canUseLocalLoadTransposed(opIdx, sharedOrder)) {
-      rewriter.modifyOpInPlace(cvtOp, [&]() {
-        cvtOp.getSrcMutable().assign(localLoad.getResult());
-      });
-      LDBG("Updated cvt op: " << *cvtOp);
-    } else {
-      return rewriter.notifyMatchFailure(loadOp, "currently not supported");
-    }
-
-    LDBG("Updated Trans dot: " << *transDot);
-
-    return success();
-  }
-
-private:
-  bool canUseLocalLoadTransposed(unsigned opIdx,
-                                 ArrayRef<unsigned> sharedOrder) const {
-    // TODO(PMylon): Comment out for now, until lowering from
-    // local_load_transposed to ds_read_tr is supported.
-    // unsigned kDimIdx = (opIdx == 0) ? 1 : 0;
-    // bool isCDNA4 = (isaFamily == triton::AMD::ISAFamily::CDNA4);
-    // bool isKContig = (sharedOrder[0] == kDimIdx);
-    return false;
-  }
-
-  triton::AMD::ISAFamily isaFamily;
-};
-
 // This pattern creates LocalAllocOp and LocalLoadOp with unswizzled shared
 // layout for the scale operand used in ScaledUpcastFp4Op/ScaledUpcastFp8Op.
 // StreamPipeliner will respect the layout created here and pipeline ops
@@ -304,7 +125,6 @@ class TritonAMDGPUOptimizeDotOperands

     mlir::RewritePatternSet patterns(context);
     auto isaFamily = triton::AMD::deduceISAFamily(archGenerationName);
-    patterns.add<ReuseShmemForDirectAndTransposedUse>(context, isaFamily);
     patterns
         .add<AllocSharedMemForUpcastedScales<tt::amdgpu::ScaledUpcastFp8Op>,
              AllocSharedMemForUpcastedScales<tt::amdgpu::ScaledUpcastFp4Op>>(
