@@ -391,6 +391,135 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return combineCtaCgaWithShape(ctaLayout, getCTALayout(), shape);
 }
 
+LinearLayout chooseDotDsReadB64Tr16Layout(DotOperandEncodingAttr dotMfmaLayout,
+                                          ArrayRef<int64_t> shape,
+                                          int32_t elemBitWidth) {
+  auto mfmaLayout = llvm::cast<AMDMfmaEncodingAttr>(dotMfmaLayout.getParent());
+  assert(mfmaLayout.getMDim() == 16 || mfmaLayout.getMDim() == 32);
+  assert(elemBitWidth == 16);
+
+  auto rank = shape.size();
+  bool hasBatchDim = rank == 3;
+  int32_t kWidthDot = dotMfmaLayout.getKWidth();
+  // Number of bits loaded by an LDS read. ds_read_tr primarily supports 64-bit
+  // loads for most element sizes (16b, 8b, 4b).
+  const int32_t ldsReadWidth = 64;
+  int32_t kWidthTransRead = ldsReadWidth / elemBitWidth;
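+  // For the 16-bit elements handled here, each 64-bit read covers
+  // kWidthTransRead = 64 / 16 = 4 elements.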
+  auto kDim = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
+
+  int32_t kSize = shape[kDim];
+  auto warpsPerCTA = mfmaLayout.getWarpsPerCTA();
+
+  MLIRContext *ctx = dotMfmaLayout.getContext();
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  StringAttr kRegister = S("register");
+  StringAttr kLane = S("lane");
+  StringAttr kWarp = S("warp");
+
+  // register order
+  // operand A: [1, 0] / [2, 1, 0]
+  // operand B: [0, 1] / [1, 2, 0]
+  // The regular dot mfma order for both operands is [k, nonk]/[k, nonk, batch].
+  // For the LDS transpose layout, swap the order to [nonk, k]/[nonk, k, batch].
+  SmallVector<unsigned> order = triton::gpu::getOrder(dotMfmaLayout);
+  std::swap(order[0], order[1]);
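+  // e.g. for operand A with rank == 2, getOrder() returns [1, 0] ([k, nonk]);
+  // after the swap, order == [0, 1], i.e. [nonk, k].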
+
+  // In the LDS transpose logic, each thread accesses 64 bits (8 bytes) of
+  // data. The smallest unit for transposing is a 4x4 sub-tile of threads,
+  // where each thread reads 4 16-bit elements along the non-K dimension,
+  // resulting in a [non-K, K] = {16, 4} sub-tile of elements. Because of the
+  // transposing mechanism, each thread ends up with 4 16-bit elements along
+  // the K dimension.
+  //
+  // The MFMA selection logic prioritizes double-rate MFMA instructions
+  // whenever possible. Specifically:
+  // - For MFMA operations with non-K = 16: when blockK > 16, mfma16x16x32 is
+  //   selected; otherwise (blockK <= 16), mfma16x16x16 remains the choice.
+  // - For MFMA operations with non-K = 32: when blockK > 8, mfma32x32x16 is
+  //   selected; otherwise (blockK <= 8), mfma32x32x8 is used.
+  //
+  // In double-rate MFMA instructions, each thread holds 8 elements along the
+  // K dimension:
+  // - The first 4 elements belong to the first sub-tile.
+  // - The next 4 elements belong to the second sub-tile.
+  //
+  // We then group these into larger tiles, each consisting of 8 of these 16x4
+  // sub-tiles. Each such tile holds the data for one mfma instruction. The
+  // tile shape depends on the MFMA instruction used:
+  // 1. For mfma32x32x16, the tile shape is [non-K, K] = {32, 16}.
+  // 2. For mfma16x16x32, the tile shape is [non-K, K] = {16, 32}.
+  //
+  // For single-rate mfma instructions, each thread holds 4 elements along the
+  // K dimension. This means the larger tile (corresponding to one mfma
+  // instruction) consists of 4 16x4 sub-tiles.
+  std::vector<std::vector<int32_t>> registerBase = {{1, 0},
+                                                    {2, 0}}; // first sub-tile
+  std::vector<std::vector<int32_t>> laneBase = {{kWidthTransRead, 0},
+                                                {2 * kWidthTransRead, 0},
+                                                {0, 1},
+                                                {0, 2}}; // first sub-tile
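+  // With kWidthTransRead == 4, laneBase starts as {{4, 0}, {8, 0}, {0, 1},
+  // {0, 2}}: lane bits 0-1 step along non-K, lane bits 2-3 along K, so 16
+  // lanes plus the 4 register elements per lane cover one
+  // [non-K, K] = {16, 4} sub-tile.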
+
+  // Extend the register base for multiple tiles in the K dimension
+  // (corresponding to multiple mfma instructions across the K dim).
+  auto populateRegisterBase = [&](int kTileSize) {
+    const int regsPerTile = 8;
+    int numRegs = (kSize / kTileSize) * regsPerTile;
+    for (int reg = regsPerTile; reg < numRegs; reg *= 2) {
+      registerBase.push_back({0, (reg / regsPerTile) * kTileSize});
+    }
+  };
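+  // e.g. for mfma16x16x32 (kTileSize == 32) and kSize == 64: numRegs == 16,
+  // so the loop appends {0, 32}, addressing the second mfma tile along K.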
+
+  const bool isMfma32 = (mfmaLayout.getMDim() == 32);
+  const bool isMfma16 = (mfmaLayout.getMDim() == 16);
+  const int kTileSize = isMfma32 ? 16 : 32;
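+  // kTileSize is the K extent of one double-rate mfma tile: 16 for
+  // mfma32x32x16 and 32 for mfma16x16x32.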
+
+  if (kSize >= kTileSize) {
+    // Handles mfma32x32x16 and mfma16x16x32 cases
+    assert(kWidthDot == 8);
+    registerBase.push_back({0, 4}); // second sub-tile
+    populateRegisterBase(kTileSize);
+    auto laneBaseExt = isMfma32
+                           ? std::vector<std::vector<int32_t>>{{16, 0}, {0, 8}}
+                           : std::vector<std::vector<int32_t>>{{0, 8}, {0, 16}};
+    laneBase.insert(laneBase.end(), laneBaseExt.begin(), laneBaseExt.end());
+  } else {
+    // Handles mfma32x32x8 and mfma16x16x16 cases
+    assert(kWidthDot == 4);
+    auto laneBaseExt = isMfma32
+                           ? std::vector<std::vector<int32_t>>{{16, 0}, {0, 4}}
+                           : std::vector<std::vector<int32_t>>{{0, 4}, {0, 8}};
+    laneBase.insert(laneBase.end(), laneBaseExt.begin(), laneBaseExt.end());
+  }
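+  // e.g. for mfma16x16x32 the final laneBase is {{4, 0}, {8, 0}, {0, 1},
+  // {0, 2}, {0, 8}, {0, 16}}: the 64 lanes together with the 8 registers of
+  // one tile cover the [non-K, K] = {16, 32} tile.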
+
+  // The base vectors above are defined in a fixed [non-k-dim, k-dim] order.
+  // The `order` array assigns them to the actual matrix dimensions:
+  // For operand A: non-k-dim -> dim0, k-dim -> dim1
+  // For operand B: non-k-dim -> dim1, k-dim -> dim0
+  LinearLayout tileLayout({{kRegister, registerBase}, {kLane, laneBase}},
+                          {outDimNames[order[0]], outDimNames[order[1]]});
+
+  if (hasBatchDim) {
+    assert(order[2] == 0);
+    // Extend the base vector with one value to accommodate the batch
+    // dimension, which appears last.
+    tileLayout *= LinearLayout::identity1D(1, kRegister, outDimNames[order[2]]);
+    tileLayout *= LinearLayout::identity1D(1, kLane, outDimNames[order[2]]);
+  }
+
+  // warp order
+  // common for both operand A and B: [0, 1] / [0, 1, 2]
+  // in both cases it is [M dim, N dim]/[batch, M dim, N dim]
+  SmallVector<unsigned> warpOrder = triton::gpu::getWarpOrder(dotMfmaLayout);
+  LinearLayout warpLayout = identityStandardND(kWarp, warpsPerCTA, warpOrder);
+
+  LinearLayout ctaLayout = tileLayout.transposeOuts(outDimNames) *
+                           warpLayout.transposeOuts(outDimNames);
+  auto finalLayout =
+      combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
+
+  return finalLayout;
+}
+
 LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
                                    ArrayRef<int64_t> shape) {
 
@@ -1204,4 +1333,10 @@ LinearLayout chooseLdMatrixLayout(Attribute enc, ArrayRef<int64_t> shape,
   return chooseDotLdMatrixLayout(dot, shape, needTrans, elemBitWidth);
 }
 
+LinearLayout chooseDsReadB64Tr16Layout(Attribute enc, ArrayRef<int64_t> shape,
+                                       int32_t elemBitWidth) {
+  auto dot = cast<DotOperandEncodingAttr>(enc);
+  return chooseDotDsReadB64Tr16Layout(dot, shape, elemBitWidth);
+}
+
 } // namespace mlir::triton::gpu