
Commit 4bcdbde

[AMD] Emit shared memory ops for M/N packed FP4 (#7626)
This adds support for FP4 packed along M/N. MFMA only supports K-packed inputs, so we need to transpose the inputs from M/N packed, M/N contiguous to K packed, K contiguous. This is achieved by changing the contiguity of the tensor (as Triton provides K-contiguous data) when storing it into shared memory and then transposing the data using LocalLoadPackedTransposedOp. To keep the scope of the change as localised as possible, this is done directly in AccelerateAMDMatmul. This change achieves two things:

- Store the input tensor in shared memory in an M/N-contiguous way (swapping the shared layout order)
- Transpose the tensor using ds_read_b64_tr4
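For reference, a minimal standalone sketch of the shape and shared-order bookkeeping this rewrite performs on the packed-i8 view of an M/N packed operand, using the operand shapes from the new lit tests. This is an illustration only, not the pass code: PackedView and toKPackedView are made-up names.

#include <array>
#include <cstdio>

struct PackedView {
  std::array<long, 2> shape;     // i8 tensor shape (two FP4 values per byte)
  std::array<unsigned, 2> order; // shared-memory order used for the store
};

// opIdx == 0 -> A with shape [M, K]; opIdx == 1 -> B with shape [K, N].
// For an M/N packed operand the non-K dimension holds two FP4 per byte, so the
// K-packed view produced by the transposed load doubles the non-K dim and
// halves the K dim; the shared layout order is flipped so the store is M/N
// contiguous.
PackedView toKPackedView(std::array<long, 2> shape, int opIdx) {
  int nonKDim = (opIdx == 0) ? 0 : 1;
  int kDim = (opIdx == 0) ? 1 : 0;
  PackedView out;
  out.shape = shape;
  out.shape[nonKDim] *= 2; // unpacked along M/N after the transpose
  out.shape[kDim] /= 2;    // packed along K instead
  out.order = (opIdx == 1) ? std::array<unsigned, 2>{1, 0}  // B: order [1, 0]
                           : std::array<unsigned, 2>{0, 1}; // A: order [0, 1]
  return out;
}

int main() {
  // A operand from the tests: tensor<64x128xi8> (M packed) -> tensor<128x64xi8>.
  PackedView a = toKPackedView({64, 128}, /*opIdx=*/0);
  // B operand from the tests: tensor<128x64xi8> (N packed) -> tensor<64x128xi8>.
  PackedView b = toKPackedView({128, 64}, /*opIdx=*/1);
  std::printf("A: %ldx%ld, order [%u, %u]\n", a.shape[0], a.shape[1], a.order[0], a.order[1]);
  std::printf("B: %ldx%ld, order [%u, %u]\n", b.shape[0], b.shape[1], b.order[0], b.order[1]);
  return 0;
}

The printed shapes and orders match the !ttg.memdesc and dot-operand tensor types in the CHECK lines of the new gfx950 tests below.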
1 parent 3ee52d3 · commit 4bcdbde

File tree

3 files changed: +242 -6 lines


test/TritonGPU/amd/accelerate-amd-matmul-mfma-gfx950.mlir

Lines changed: 71 additions & 0 deletions
@@ -223,3 +223,74 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 16, perPhase = 4, maxPhase = 4, order = [1, 0]}>
+// CHECK-LABEL: mfma_dot_scaled_mxfp4_b_packed_mn
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_dot_scaled_mxfp4_b_packed_mn(
+    %a: tensor<128x128xf8E5M2, #blocked>,
+    %b: tensor<128x64xi8, #blocked1>,
+    %c: tensor<128x128xf32, #blocked>,
+    %arg4: tensor<128x128x!tt.ptr<f32>, #blocked>
+  ) {
+    %b1 = ttg.convert_layout %b : tensor<128x64xi8, #blocked1> -> tensor<128x64xi8, #blocked>
+    // CHECK: %[[ALLOCB:.+]] = ttg.local_alloc {{.*}} : (tensor<128x64xi8, #blocked>) -> !ttg.memdesc<128x64xi8, #shared, #smem>
+    // CHECK: %[[B:.+]] = amdgpu.local_load_packed_tranposed %[[ALLOCB]] : !ttg.memdesc<128x64xi8, #shared, #smem> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>
+    // CHECK: tt.dot_scaled %{{.*}}, %[[B]], %{{.*}} lhs = e5m2 rhs = e2m1 {fastMath = false}
+    %accumulator_52 = tt.dot_scaled %a, %b1, %c lhs = e5m2 rhs = e2m1 {fastMath = false, rhs_k_pack = false} : tensor<128x128xf8E5M2, #blocked> * tensor<128x64xi8, #blocked> -> tensor<128x128xf32, #blocked>
+    tt.store %arg4, %accumulator_52 : tensor<128x128x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 16, perPhase = 4, maxPhase = 4, order = [0, 1]}>
+// CHECK-LABEL: mfma_dot_scaled_mxfp4_a_packed_mn
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_dot_scaled_mxfp4_a_packed_mn(
+    %a: tensor<64x128xi8, #blocked>,
+    %b: tensor<128x128xf8E5M2, #blocked1>,
+    %c: tensor<128x128xf32, #blocked>,
+    %arg4: tensor<128x128x!tt.ptr<f32>, #blocked>
+  ) {
+    %b1 = ttg.convert_layout %b : tensor<128x128xf8E5M2, #blocked1> -> tensor<128x128xf8E5M2, #blocked>
+    // CHECK: %[[ALLOCA:.+]] = ttg.local_alloc {{.*}} : (tensor<64x128xi8, #blocked>) -> !ttg.memdesc<64x128xi8, #shared, #smem>
+    // CHECK: %[[A:.+]] = amdgpu.local_load_packed_tranposed %[[ALLOCA]] : !ttg.memdesc<64x128xi8, #shared, #smem> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
+    // CHECK: tt.dot_scaled %[[A]], %{{.*}}, %{{.*}} lhs = e2m1 rhs = e5m2 {fastMath = false}
+    %accumulator_52 = tt.dot_scaled %a, %b1, %c lhs = e2m1 rhs = e5m2 {fastMath = false, lhs_k_pack = false} : tensor<64x128xi8, #blocked> * tensor<128x128xf8E5M2, #blocked> -> tensor<128x128xf32, #blocked>
+    tt.store %arg4, %accumulator_52 : tensor<128x128x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 16, perPhase = 4, maxPhase = 4, order = [0, 1]}>
+// CHECK{LITERAL}: #shared1 = #ttg.swizzled_shared<{vec = 16, perPhase = 4, maxPhase = 4, order = [1, 0]}>
+// CHECK-LABEL: mfma_dot_scaled_mxfp4_ab_packed_mn
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_dot_scaled_mxfp4_ab_packed_mn(
+    %a: tensor<64x128xi8, #blocked>,
+    %b: tensor<128x64xi8, #blocked1>,
+    %c: tensor<128x128xf32, #blocked>,
+    %arg4: tensor<128x128x!tt.ptr<f32>, #blocked>
+  ) {
+    %b1 = ttg.convert_layout %b : tensor<128x64xi8, #blocked1> -> tensor<128x64xi8, #blocked>
+    // CHECK: %[[ALLOCA:.+]] = ttg.local_alloc {{.*}} : (tensor<64x128xi8, #blocked>) -> !ttg.memdesc<64x128xi8, #shared, #smem>
+    // CHECK: %[[A:.+]] = amdgpu.local_load_packed_tranposed %[[ALLOCA]] : !ttg.memdesc<64x128xi8, #shared, #smem> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
+    // CHECK: %[[ALLOCB:.+]] = ttg.local_alloc {{.*}} : (tensor<128x64xi8, #blocked>) -> !ttg.memdesc<128x64xi8, #shared1, #smem>
+    // CHECK: %[[B:.+]] = amdgpu.local_load_packed_tranposed %[[ALLOCB]] : !ttg.memdesc<128x64xi8, #shared1, #smem> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>
+    // CHECK: tt.dot_scaled %[[A]], %[[B]], %{{.*}} lhs = e2m1 rhs = e2m1 {fastMath = false}
+    %accumulator_52 = tt.dot_scaled %a, %b1, %c lhs = e2m1 rhs = e2m1 {fastMath = false, lhs_k_pack = false, rhs_k_pack = false} : tensor<64x128xi8, #blocked> * tensor<128x64xi8, #blocked> -> tensor<128x128xf32, #blocked>
+    tt.store %arg4, %accumulator_52 : tensor<128x128x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 120 additions & 0 deletions
@@ -797,3 +797,123 @@ tt.func @pipeline_fp64_with_async_copy_gfx950(
     tt.return %loop: tensor<128x128xf64, #C>
   }
 }
+
+// -----
+
+// COMMON-LABEL: pipelining_local_load_packed_transposed
+
+// Prologue
+// COMMON: ttg.local_alloc
+// COMMON: ttg.local_alloc
+// ASYNC: ttg.async_copy_global_to_local
+// SYNC: tt.load
+// COMMON: tt.load
+// SYNC: ttg.local_store
+// COMMON: ttg.local_store
+
+// Main loop
+// COMMON: scf.for
+// COMMON: ttg.local_load
+// COMMON: amdgpu.local_load_packed_tranposed
+// COMMON: tt.dot_scaled
+// COMMON: scf.yield
+
+// Epilogue
+// COMMON: ttg.local_load
+// COMMON: amdgpu.local_load_packed_tranposed
+// COMMON: scf.if
+// COMMON: tt.dot_scaled
+// COMMON-COUNT-2: scf.yield
+// COMMON-COUNT-2: ttg.local_dealloc
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = true}>
+#shared = #ttg.swizzled_shared<{vec = 16, perPhase = 4, maxPhase = 4, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @pipelining_local_load_packed_transposed(%a_ptr: !tt.ptr<f8E5M2> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %b_ptr: !tt.ptr<i8> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %output_ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %M: i32 {tt.divisibility = 16 : i32}, %N: i32 {tt.divisibility = 16 : i32}, %K: i32 {tt.divisibility = 16 : i32}, %stride_scale: i32 {tt.divisibility = 16 : i32}, %stride_am: i32 {tt.divisibility = 16 : i32}, %stride_bn: i32 {tt.divisibility = 16 : i32}, %stride_cm: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<128x128xi32, #blocked>
+    %cst_0 = arith.constant dense<128> : tensor<128x64xi32, #blocked1>
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c127_i32 = arith.constant 127 : i32
+    %c128_i32 = arith.constant 128 : i32
+    %c2_i32 = arith.constant 2 : i32
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.addi %M, %c127_i32 : i32
+    %2 = arith.divsi %1, %c128_i32 : i32
+    %3 = arith.remsi %0, %2 : i32
+    %4 = arith.divsi %0, %2 : i32
+    %5 = arith.muli %3, %c128_i32 : i32
+    %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %7 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
+    %8 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>>
+    %9 = tt.splat %5 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %10 = tt.splat %5 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
+    %11 = arith.addi %9, %6 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %12 = arith.addi %10, %7 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
+    %13 = arith.muli %4, %c128_i32 : i32
+    %14 = arith.divsi %13, %c2_i32 : i32
+    %15 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
+    %16 = tt.splat %14 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
+    %17 = arith.addi %16, %15 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
+    %18 = tt.expand_dims %11 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked>
+    %19 = tt.expand_dims %12 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2>
+    %20 = tt.splat %stride_am : i32 -> tensor<128x1xi32, #blocked>
+    %21 = arith.muli %18, %20 : tensor<128x1xi32, #blocked>
+    %22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %23 = tt.expand_dims %22 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked>
+    %24 = tt.broadcast %21 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked>
+    %25 = tt.broadcast %23 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked>
+    %26 = arith.addi %24, %25 : tensor<128x128xi32, #blocked>
+    %27 = tt.splat %a_ptr : !tt.ptr<f8E5M2> -> tensor<128x128x!tt.ptr<f8E5M2>, #blocked>
+    %28 = tt.addptr %27, %26 : tensor<128x128x!tt.ptr<f8E5M2>, #blocked>, tensor<128x128xi32, #blocked>
+    %29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>
+    %30 = tt.expand_dims %29 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1>
+    %31 = tt.expand_dims %17 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1>
+    %32 = tt.splat %stride_bn : i32 -> tensor<1x64xi32, #blocked1>
+    %33 = arith.muli %31, %32 : tensor<1x64xi32, #blocked1>
+    %34 = tt.broadcast %30 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1>
+    %35 = tt.broadcast %33 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1>
+    %36 = arith.addi %34, %35 : tensor<128x64xi32, #blocked1>
+    %37 = tt.splat %b_ptr : !tt.ptr<i8> -> tensor<128x64x!tt.ptr<i8>, #blocked1>
+    %38 = tt.addptr %37, %36 : tensor<128x64x!tt.ptr<i8>, #blocked1>, tensor<128x64xi32, #blocked1>
+    %39 = arith.addi %K, %c127_i32 : i32
+    %40 = arith.divsi %39, %c128_i32 : i32
+    %accumulator:3 = scf.for %accumulator_2 = %c0_i32 to %40 step %c1_i32 iter_args(%arg11 = %cst_1, %arg12 = %28, %arg13 = %38) -> (tensor<128x128xf32, #mma>, tensor<128x128x!tt.ptr<f8E5M2>, #blocked>, tensor<128x64x!tt.ptr<i8>, #blocked1>) : i32 {
+      %60 = tt.load %arg12 : tensor<128x128x!tt.ptr<f8E5M2>, #blocked>
+      %61 = tt.load %arg13 : tensor<128x64x!tt.ptr<i8>, #blocked1>
+      %62 = ttg.convert_layout %60 : tensor<128x128xf8E5M2, #blocked> -> tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
+      %63 = ttg.local_alloc %61 : (tensor<128x64xi8, #blocked1>) -> !ttg.memdesc<128x64xi8, #shared, #smem>
+      %64 = amdgpu.local_load_packed_tranposed %63 : !ttg.memdesc<128x64xi8, #shared, #smem> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>
+      %65 = tt.dot_scaled %62, %64, %arg11 lhs = e5m2 rhs = e2m1 {fastMath = false} : tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> * tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>> -> tensor<128x128xf32, #mma>
+      %66 = tt.addptr %arg12, %cst : tensor<128x128x!tt.ptr<f8E5M2>, #blocked>, tensor<128x128xi32, #blocked>
+      %67 = tt.addptr %arg13, %cst_0 : tensor<128x64x!tt.ptr<i8>, #blocked1>, tensor<128x64xi32, #blocked1>
+      scf.yield %65, %66, %67 : tensor<128x128xf32, #mma>, tensor<128x128x!tt.ptr<f8E5M2>, #blocked>, tensor<128x64x!tt.ptr<i8>, #blocked1>
+    } {tt.num_stages = 2 : i32}
+    %41 = tt.splat %13 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>>
+    %42 = arith.addi %41, %8 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>>
+    %43 = tt.splat %stride_cm : i32 -> tensor<128x1xi32, #blocked2>
+    %44 = arith.muli %43, %19 : tensor<128x1xi32, #blocked2>
+    %45 = tt.splat %output_ptr : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked2>
+    %46 = tt.addptr %45, %44 : tensor<128x1x!tt.ptr<f32>, #blocked2>, tensor<128x1xi32, #blocked2>
+    %47 = tt.expand_dims %42 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x128xi32, #blocked2>
+    %48 = tt.broadcast %46 : tensor<128x1x!tt.ptr<f32>, #blocked2> -> tensor<128x128x!tt.ptr<f32>, #blocked2>
+    %49 = tt.broadcast %47 : tensor<1x128xi32, #blocked2> -> tensor<128x128xi32, #blocked2>
+    %50 = tt.addptr %48, %49 : tensor<128x128x!tt.ptr<f32>, #blocked2>, tensor<128x128xi32, #blocked2>
+    %51 = tt.splat %M : i32 -> tensor<128x1xi32, #blocked2>
+    %52 = arith.cmpi slt, %19, %51 : tensor<128x1xi32, #blocked2>
+    %53 = tt.splat %N : i32 -> tensor<1x128xi32, #blocked2>
+    %54 = arith.cmpi slt, %47, %53 : tensor<1x128xi32, #blocked2>
+    %55 = tt.broadcast %52 : tensor<128x1xi1, #blocked2> -> tensor<128x128xi1, #blocked2>
+    %56 = tt.broadcast %54 : tensor<1x128xi1, #blocked2> -> tensor<128x128xi1, #blocked2>
+    %57 = arith.andi %55, %56 : tensor<128x128xi1, #blocked2>
+    %58 = ttg.convert_layout %50 : tensor<128x128x!tt.ptr<f32>, #blocked2> -> tensor<128x128x!tt.ptr<f32>, #mma>
+    %59 = ttg.convert_layout %57 : tensor<128x128xi1, #blocked2> -> tensor<128x128xi1, #mma>
+    tt.store %58, %accumulator#0, %59 : tensor<128x128x!tt.ptr<f32>, #mma>
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 51 additions & 6 deletions
@@ -209,7 +209,7 @@ FailureOr<MfmaIntrinsic> chooseMfmaInstruction(tt::DotScaledOp dot,
                                                int mfmaVersion, int nonKDim) {
   auto ctx = dot.getContext();
   int64_t inputKDim = dot.getA().getType().getShape().back();
-  if (dot.getAElemType() == ScaleDotElemType::E2M1) {
+  if (dot.getAElemType() == ScaleDotElemType::E2M1 && dot.getLhsKPack()) {
     // Since two fp4 are packed into int8, to get the correct K dim size, we
     // need to multiply it by 2.
     inputKDim *= 2;
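To make the effect of the new getLhsKPack() guard above concrete, here is a small worked example with the logical 128x128 FP4 A operand from the tests (illustrative only; logicalK is a made-up helper, not part of the pass):

#include <cassert>

// Two FP4 values share one i8 byte, so the i8 K extent only needs the *2
// correction when the packing is along K:
//   K packed: stored as tensor<128x64xi8> -> shape.back() = 64,  logical K = 64 * 2 = 128
//   M packed: stored as tensor<64x128xi8> -> shape.back() = 128, logical K is already 128
long logicalK(long i8KExtent, bool kPacked) {
  return kPacked ? i8KExtent * 2 : i8KExtent; // mirrors the guarded inputKDim *= 2
}

int main() {
  assert(logicalK(64, /*kPacked=*/true) == 128);
  assert(logicalK(128, /*kPacked=*/false) == 128);
  return 0;
}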
@@ -928,11 +928,56 @@ class ScaledBlockedToScaledMFMAF8F6F4 final
       auto newEnc =
           DotOperandEncodingAttr::get(ctx, opIdx, mfmaEnc, kWidth / 2);

-      (opIdx == 0 ? aEncLL : bEncLL) *=
-          newEnc.toLinearLayout(opIdx == 0 ? aShape : bShape);
-      auto newVType = RankedTensorType::get(vType.getShape(),
-                                            vType.getElementType(), newEnc);
-      return rewriter.create<ttg::ConvertLayoutOp>(v.getLoc(), newVType, v);
+      bool kPacked = opIdx == 0 ? dotOp.getLhsKPack() : dotOp.getRhsKPack();
+      if (kPacked == false) {
+        // This is FP4 with M/N packing. Create local alloc + local load here
+        // so we have control of the shared layout
+        // A, M packed: tensor<16x64xi8> --> 32x32
+        // B, N packed: tensor<64x16xi8> --> 32x32
+        SmallVector<int64_t> newShape(vType.getShape());
+        newShape[opIdx == 0 ? 0 : 1] = newShape[opIdx == 0 ? 0 : 1] * 2;
+        newShape[opIdx == 0 ? 1 : 0] = newShape[opIdx == 0 ? 1 : 0] / 2;
+        auto newVType =
+            RankedTensorType::get(newShape, vType.getElementType(), newEnc);
+        OpBuilder builder(dotOp);
+        auto srcEncoding = vType.getEncoding();
+        auto originalOrder = triton::gpu::getOrderForMemory(vType);
+        SmallVector<unsigned> newOrder = originalOrder;
+        if (opIdx == 1) {
+          newOrder = {1, 0};
+        } else {
+          newOrder = {0, 1};
+        }
+        auto sharedMemorySpace =
+            triton::gpu::SharedMemorySpaceAttr::get(vType.getContext());
+        auto tmpType = triton::gpu::MemDescType::get(
+            vType.getShape(), vType.getElementType(),
+            triton::gpu::SwizzledSharedEncodingAttr::get(
+                v.getContext(), newEnc, vType.getShape(), newOrder,
+                triton::gpu::getCTALayout(srcEncoding), vType.getElementType()),
+            sharedMemorySpace);
+        auto tmp = builder.create<triton::gpu::LocalAllocOp>(dotOp.getLoc(),
+                                                             tmpType, v);
+        auto newConvert =
+            builder.create<triton::amdgpu::LocalLoadPackedTransposedOp>(
+                dotOp.getLoc(), newVType, tmp);
+        if (opIdx == 0) {
+          aShape = newConvert.getType().getShape();
+          aEncLL *= newEnc.toLinearLayout(aShape);
+        } else {
+          bShape = newConvert.getType().getShape();
+          bEncLL *= newEnc.toLinearLayout(bShape);
+        }
+        return newConvert;
+      } else {
+        if (opIdx == 0)
+          aEncLL *= newEnc.toLinearLayout(aShape);
+        else
+          bEncLL *= newEnc.toLinearLayout(bShape);
+        auto newVType = RankedTensorType::get(vType.getShape(),
+                                              vType.getElementType(), newEnc);
+        return rewriter.create<ttg::ConvertLayoutOp>(v.getLoc(), newVType, v);
+      }
     };
     a = convertInputLayout(a, 0);
     b = convertInputLayout(b, 1);
