From 133109de1248ddf934fd46409dece99d78ef59d5 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Thu, 21 Nov 2024 13:45:16 -0800 Subject: [PATCH 01/12] [DRAFT] Completely remove `MemDesc` from the Triton dialect (#5208) After this PR, `MemDesc` will be a type only in the TritonGPU dialect, as will the `TensorOrMemDesc` interface. --- .../TritonGPUToLLVM/TypeConverter.h | 5 +- .../Conversion/TritonGPUToLLVM/Utility.h | 10 +- .../triton/Dialect/Triton/IR/CMakeLists.txt | 4 - include/triton/Dialect/Triton/IR/Traits.h | 6 +- include/triton/Dialect/Triton/IR/TritonOps.td | 1 - .../triton/Dialect/Triton/IR/TritonTypes.td | 48 --- include/triton/Dialect/Triton/IR/Types.h | 2 - .../triton/Dialect/TritonGPU/IR/Attributes.h | 2 +- .../Dialect/TritonGPU/IR/CMakeLists.txt | 13 +- include/triton/Dialect/TritonGPU/IR/Dialect.h | 2 +- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 2 +- .../TritonGPU/IR/TritonGPUInterfaces.h | 5 +- .../Dialect/TritonGPU/IR/TritonGPUOps.td | 29 +- .../IR/TritonGPUTypeInterfaces.td} | 11 +- .../Dialect/TritonGPU/IR/TritonGPUTypes.td | 55 ++- include/triton/Dialect/TritonGPU/IR/Types.h | 3 + .../TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td | 25 +- lib/Analysis/Alias.cpp | 4 +- lib/Analysis/CMakeLists.txt | 2 + lib/Analysis/Utility.cpp | 3 +- .../SharedToDotOperandFMA.cpp | 1 + .../TritonGPUToLLVM/TypeConverter.cpp | 1 + lib/Conversion/TritonGPUToLLVM/Utility.cpp | 16 +- .../TritonGPUToLLVM/ViewOpToLLVM.cpp | 3 +- lib/Dialect/Triton/IR/Dialect.cpp | 42 -- lib/Dialect/Triton/IR/Types.cpp | 55 --- lib/Dialect/TritonGPU/IR/CMakeLists.txt | 1 + lib/Dialect/TritonGPU/IR/Dialect.cpp | 51 ++- lib/Dialect/TritonGPU/IR/Types.cpp | 55 +++ .../Pipeliner/MatmulLoopPipeline.cpp | 64 +-- .../Pipeliner/PipeliningUtility.cpp | 6 +- .../Pipeliner/TMAStoresPipeline.cpp | 4 +- lib/Dialect/TritonGPU/Transforms/Prefetch.cpp | 6 +- .../Transforms/ReduceDataDuplication.cpp | 2 +- lib/Dialect/TritonGPU/Transforms/Utility.cpp | 7 +- lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp | 20 +- python/test/unit/language/test_core.py | 8 +- test/Analysis/test-alias.mlir | 86 ++-- test/Analysis/test-allocation.mlir | 302 +++++++-------- test/Analysis/test-membar.mlir | 366 +++++++++--------- test/Conversion/amd/compute-base-ptr.mlir | 4 +- .../decompose-unsupported-conversions.mlir | 4 +- test/Conversion/amd/tritongpu_to_llvm.mlir | 10 +- .../amd/tritongpu_wmma_dot_to_llvm.mlir | 18 +- test/Conversion/tritongpu_to_llvm.mlir | 100 ++--- test/Conversion/tritongpu_to_llvm_hopper.mlir | 36 +- test/Conversion/tritonnvidiagpu_to_llvm.mlir | 16 +- test/Triton/invalid.mlir | 6 +- test/TritonGPU/accumulator-init.mlir | 70 ++-- .../amd/amd-reorder-instructions.mlir | 136 +++---- test/TritonGPU/amd/amd-sched-2nd-load.mlir | 58 +-- test/TritonGPU/amd/optimize-lds-usage.mlir | 28 +- test/TritonGPU/canonicalize.mlir | 34 +- test/TritonGPU/combine.mlir | 12 +- test/TritonGPU/dot-operands.mlir | 32 +- test/TritonGPU/fence-inserstion.mlir | 12 +- test/TritonGPU/invalid.mlir | 20 +- test/TritonGPU/loop-pipeline-cuda.mlir | 24 +- test/TritonGPU/loop-pipeline-hip.mlir | 18 +- test/TritonGPU/loop-pipeline-hopper.mlir | 80 ++-- test/TritonGPU/loop-pipeline.mlir | 14 +- .../pipeline-hopper-remove-wait.mlir | 10 +- test/TritonGPU/prefetch.mlir | 48 +-- test/TritonGPU/reduce-data-duplication.mlir | 2 +- test/TritonGPU/reorder-instructions.mlir | 46 +-- test/TritonGPU/tritongpu_ops.mlir | 4 +- test/TritonNvidiaGPU/membar.mlir | 26 +- .../ConvertLayoutOpToLLVM.cpp | 1 + .../SharedToDotOperandMFMA.cpp | 2 +- .../SharedToDotOperandWMMA.cpp | 2 +- 
.../StreamPipelineV2.cpp | 17 +- .../SharedToDotOperandMMAv2OrV3.cpp | 1 + .../DecomposeUnsupportedConversions.cpp | 2 +- .../DotOpToLLVM/WGMMA.cpp | 9 +- 74 files changed, 1130 insertions(+), 1100 deletions(-) rename include/triton/Dialect/{Triton/IR/TritonTypeInterfaces.td => TritonGPU/IR/TritonGPUTypeInterfaces.td} (75%) diff --git a/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h b/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h index 5ae547c392..60c0ed7b61 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h +++ b/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h @@ -18,11 +18,12 @@ class TritonGPUToLLVMTypeConverter : public LLVMTypeConverter { const TargetInfoBase &targetInfo, const DataLayoutAnalysis *analysis = nullptr); - Type getElementTypeForStruct(TensorOrMemDesc type); + Type getElementTypeForStruct(triton::gpu::TensorOrMemDesc type); Type convertTritonPointerType(triton::PointerType type); Type convertTritonTensorType(RankedTensorType type, const TargetInfoBase &targetInfo); - Type convertMemDescType(MemDescType type, const TargetInfoBase &targetInfo); + Type convertMemDescType(triton::gpu::MemDescType type, + const TargetInfoBase &targetInfo); Type convertAsyncToken(triton::gpu::AsyncTokenType type); }; diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h index 9061e24fbe..ba24461a1f 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -14,6 +14,7 @@ #include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" +#include "triton/Dialect/TritonGPU/IR/Types.h" #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" #include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" @@ -1141,8 +1142,8 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, // // Returns true on success. 
[[nodiscard]] bool emitTransferBetweenRegistersAndShared( - RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy, - std::optional maxVecElems, Value shmemBase, + RankedTensorType registerTy, triton::gpu::MemDescType sharedTy, + Type elemLlvmTy, std::optional maxVecElems, Value shmemBase, ArrayRef shmemStrides, Location loc, RewriterBase &rewriter, const TargetInfoBase &target, std::function perVectorCallback); @@ -1310,13 +1311,14 @@ inline DenseMap getSwizzledSharedPtrs( } SmallVector loadSharedToDistributed(RankedTensorType dstTy, - MemDescType srcTy, Type elemLlvmTy, + triton::gpu::MemDescType srcTy, + Type elemLlvmTy, SharedMemoryObject smemObj, Location loc, RewriterBase &rewriter, const TargetInfoBase &target); void storeDistributedToShared( - MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy, + triton::gpu::MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy, ArrayRef srcVals, Value smemBase, ArrayRef dstStrides, Location loc, RewriterBase &rewriter, const TargetInfoBase &target, std::pair *const llvmOpCount = nullptr); diff --git a/include/triton/Dialect/Triton/IR/CMakeLists.txt b/include/triton/Dialect/Triton/IR/CMakeLists.txt index 8139ebf1ae..fecd5adf62 100644 --- a/include/triton/Dialect/Triton/IR/CMakeLists.txt +++ b/include/triton/Dialect/Triton/IR/CMakeLists.txt @@ -20,10 +20,6 @@ set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td) mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) -set(LLVM_TARGET_DEFINITIONS TritonTypeInterfaces.td) -mlir_tablegen(TypeInterfaces.h.inc -gen-type-interface-decls) -mlir_tablegen(TypeInterfaces.cpp.inc -gen-type-interface-defs) - set(LLVM_TARGET_DEFINITIONS TritonOpInterfaces.td) mlir_tablegen(OpInterfaces.h.inc -gen-op-interface-decls) mlir_tablegen(OpInterfaces.cpp.inc -gen-op-interface-defs) diff --git a/include/triton/Dialect/Triton/IR/Traits.h b/include/triton/Dialect/Triton/IR/Traits.h index 7f0e5109e6..804b1648e9 100644 --- a/include/triton/Dialect/Triton/IR/Traits.h +++ b/include/triton/Dialect/Triton/IR/Traits.h @@ -69,9 +69,9 @@ class DotLike : public TraitBase { static LogicalResult verifyTrait(Operation *op) { if (op->getNumOperands() < 3) return op->emitOpError("expected at least 3 operands"); - auto aTy = cast(op->getOperand(0).getType()); - auto bTy = cast(op->getOperand(1).getType()); - auto cTy = cast(op->getOperand(2).getType()); + auto aTy = cast(op->getOperand(0).getType()); + auto bTy = cast(op->getOperand(1).getType()); + auto cTy = cast(op->getOperand(2).getType()); auto aShape = aTy.getShape(); auto bShape = bTy.getShape(); auto cShape = cTy.getShape(); diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td index cdf9b0ea77..197b9df7cf 100644 --- a/include/triton/Dialect/Triton/IR/TritonOps.td +++ b/include/triton/Dialect/Triton/IR/TritonOps.td @@ -13,7 +13,6 @@ include "mlir/Interfaces/SideEffectInterfaces.td" // Pure include "mlir/Interfaces/ControlFlowInterfaces.td" // BranchOpInterface include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType include "mlir/Interfaces/CallInterfaces.td" // CallOpInterface -include "triton/Dialect/Triton/IR/TritonTypeInterfaces.td" include "triton/Dialect/Triton/IR/TritonOpInterfaces.td" diff --git a/include/triton/Dialect/Triton/IR/TritonTypes.td b/include/triton/Dialect/Triton/IR/TritonTypes.td index 98f8e570a9..a70b97dbc8 100644 --- a/include/triton/Dialect/Triton/IR/TritonTypes.td +++ 
b/include/triton/Dialect/Triton/IR/TritonTypes.td @@ -92,54 +92,6 @@ def TT_TensorPtr : TT_PtrOf<[TT_Tensor]>; // Any Type in Triton IR def TT_Type : AnyTypeOf<[TT_FloatLike, TT_IntLike, TT_PtrLike, TT_TensorPtr]>; -// Memory descriptor type. -def TT_MemDescType : TritonTypeDef<"MemDesc", "memdesc", [ShapedTypeInterface]> { - let summary = "memory descriptor type (`::mlir::triton::MemDescType`) in Triton IR type system"; - - let description = [{ - Memory descriptor contains a base pointer (scalar) and a descriptor of the memory. - If mutable memory is false that means the memory is constant and can only be allocated and stored once. - A constant memory allocation is different than a tensor as it can have multiple views and the descriptor - can be changed without changing the underlying memory. - }]; - - let parameters = (ins - ArrayRefParameter<"int64_t">:$shape, - "Type":$elementType, - "Attribute":$encoding, - "Attribute":$memorySpace, - "bool":$mutable_memory - ); - let extraClassDeclaration = [{ - MemDescType cloneWith(std::optional> shape, - Type elementType) const { - return MemDescType::get(shape.value_or(getShape()), elementType, getEncoding(), getMemorySpace(), getMutableMemory()); - } - - bool hasRank() const { return true; } - }]; - let builders = [ - TypeBuilderWithInferredContext<(ins - "llvm::ArrayRef":$shape, - "Type":$elementType, - "Attribute":$encoding, - "Attribute":$memorySpace - ), [{ - return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, /*mutableMemory=*/false); - }]>, - TypeBuilderWithInferredContext<(ins - "llvm::ArrayRef":$shape, - "Type":$elementType, - "Attribute":$encoding, - "Attribute":$memorySpace, - "bool":$mutableMemory - ), [{ - return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, mutableMemory); - }]> - ]; - let hasCustomAssemblyFormat = 1; -} - // Result type of ExperimentalMakeTensorDescriptor def TT_TensorDescType : TritonTypeDef<"TensorDesc", "tensordesc", []> { let summary = "Tensor descriptor type (`::mlir::triton::TensorDescType`) in Triton IR type system"; diff --git a/include/triton/Dialect/Triton/IR/Types.h b/include/triton/Dialect/Triton/IR/Types.h index 17d2dbc8cc..6bcac9522e 100644 --- a/include/triton/Dialect/Triton/IR/Types.h +++ b/include/triton/Dialect/Triton/IR/Types.h @@ -8,8 +8,6 @@ #define GET_TYPEDEF_CLASSES #include "triton/Dialect/Triton/IR/Types.h.inc" -#include "triton/Dialect/Triton/IR/TypeInterfaces.h.inc" - namespace mlir { namespace triton { diff --git a/include/triton/Dialect/TritonGPU/IR/Attributes.h b/include/triton/Dialect/TritonGPU/IR/Attributes.h index a99ddfc17d..1f93b3d935 100644 --- a/include/triton/Dialect/TritonGPU/IR/Attributes.h +++ b/include/triton/Dialect/TritonGPU/IR/Attributes.h @@ -5,6 +5,6 @@ #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h" #define GET_ATTRDEF_CLASSES -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc" +#include "triton/Dialect/TritonGPU/IR/AttrDefs.h.inc" #endif // TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ diff --git a/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt b/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt index 73c9401c18..189f6d4307 100644 --- a/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt +++ b/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt @@ -12,10 +12,15 @@ add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc) add_public_tablegen_target(TritonGPUTableGen) set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td) -mlir_tablegen(TritonGPUAttrInterfaces.h.inc 
-gen-attr-interface-decls) -mlir_tablegen(TritonGPUAttrInterfaces.cpp.inc -gen-attr-interface-defs) -mlir_tablegen(TritonGPUAttrDefs.h.inc -gen-attrdef-decls) -mlir_tablegen(TritonGPUAttrDefs.cpp.inc -gen-attrdef-defs) +mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) +mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) +mlir_tablegen(AttrDefs.h.inc -gen-attrdef-decls) +mlir_tablegen(AttrDefs.cpp.inc -gen-attrdef-defs) mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(TritonGPUAttrDefsIncGen) + +set(LLVM_TARGET_DEFINITIONS TritonGPUTypeInterfaces.td) +mlir_tablegen(TypeInterfaces.h.inc -gen-type-interface-decls) +mlir_tablegen(TypeInterfaces.cpp.inc -gen-type-interface-defs) +add_public_tablegen_target(TritonGPUTypeInterfacesIncGen) diff --git a/include/triton/Dialect/TritonGPU/IR/Dialect.h b/include/triton/Dialect/TritonGPU/IR/Dialect.h index e0865e12af..b09fc29fb3 100644 --- a/include/triton/Dialect/TritonGPU/IR/Dialect.h +++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h @@ -9,10 +9,10 @@ // TritonGPU depends on Triton #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" -#include "triton/Dialect/TritonGPU/IR/Dialect.h.inc" #include "triton/Dialect/TritonGPU/IR/Types.h" #define GET_OP_CLASSES +#include "triton/Dialect/TritonGPU/IR/Dialect.h.inc" #include "triton/Dialect/TritonGPU/IR/Ops.h.inc" namespace mlir { diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index 26ff9f7e3a..9af550aae9 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -2,8 +2,8 @@ #define TRITONGPU_ATTRDEFS include "mlir/IR/AttrTypeBase.td" -include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "triton/Dialect/Triton/IR/TritonInterfaces.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" //===----------------------------------------------------------------------===// // TritonGPU Attribute Definitions diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h b/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h index 9cf2876d2c..1e76237dac 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h @@ -1,6 +1,9 @@ #ifndef TRITON_GPU_DIALECT_INTERFACES_H #define TRITON_GPU_DIALECT_INTERFACES_H + +// clang-format off #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrInterfaces.h.inc" +#include "triton/Dialect/TritonGPU/IR/AttrInterfaces.h.inc" +// clang-format on #endif // TRITON_GPU_DIALECT_INTERFACES_H diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td index 77cb2c8bf0..b747fddde6 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td @@ -3,6 +3,7 @@ include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td" include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td" include "mlir/Dialect/Arith/IR/ArithBase.td" include "triton/Dialect/Triton/IR/TritonTypes.td" @@ -95,7 +96,7 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [ let arguments = ( ins 
TT_PtrTensor:$src, - TT_MemDescType:$result, + TTG_MemDescType:$result, Optional:$mask, Optional:$other, DefaultValuedAttr:$cache, @@ -168,7 +169,7 @@ def TTG_LocalAllocOp : TTG_Op<"local_alloc", [DeclareOpInterfaceMethods" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{$src attr-dict `:` qualified(type($src))}]; } @@ -212,12 +213,12 @@ def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure]> { Then in Python syntax, the subview covers input[1][0:4][4:8]. }]; let arguments = ( - ins TT_MemDescType:$src, Variadic:$offsets); + ins TTG_MemDescType:$src, Variadic:$offsets); - // Use qualified() otherwise "!tt.memdesc" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{$src `[` $offsets `]` attr-dict `:` qualified(type($src)) `->` qualified(type($result))}]; - let results = (outs TT_MemDescType:$result); + let results = (outs TTG_MemDescType:$result); let hasVerifier = 1; } @@ -233,14 +234,14 @@ def TTG_MemDescTransOp : TTG_Op<"memdesc_trans", [Pure, representing a transposed view of the buffer. }]; - let arguments = (ins TT_MemDescType:$src, Variadic:$order); + let arguments = (ins TTG_MemDescType:$src, Variadic:$order); let arguments = ( - ins TT_MemDescType:$src, + ins TTG_MemDescType:$src, DenseI32ArrayAttr:$order ); - let results = (outs TT_MemDescType:$result); + let results = (outs TTG_MemDescType:$result); let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))"; @@ -253,7 +254,7 @@ def TTG_LocalLoadOp : TTG_Op<"local_load", [DeclareOpInterfaceMethods :$token); + let arguments = (ins TTG_MemDescType:$src, Optional :$token); let builders = [ OpBuilder<(ins "Type":$retType, "Value":$src), @@ -261,7 +262,7 @@ def TTG_LocalLoadOp : TTG_Op<"local_load", [DeclareOpInterfaceMethods(nullptr)); }]>]; - // Use qualified() otherwise "!tt.memdesc" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{$src (`token` $token^)? attr-dict `:` qualified(type($src)) `->` type($result)}]; let results = (outs TT_Tensor:$result); @@ -273,10 +274,10 @@ def TTG_LocalStoreOp : TTG_Op<"local_store", [DeclareOpInterfaceMethods" is printed as "". + // Use qualified() otherwise "!triton_gpu.memdesc" is printed as "". let assemblyFormat = [{ $src `,` $dst attr-dict `:` type($src) `->` qualified(type($dst)) }]; diff --git a/include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td similarity index 75% rename from include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td rename to include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td index e3aed22627..a0415b62c6 100644 --- a/include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td @@ -1,11 +1,11 @@ -#ifndef TRITON_TYPE_INTERFACES -#define TRITON_TYPE_INTERFACES +#ifndef TRITON_GPU_TYPE_INTERFACES +#define TRITON_GPU_TYPE_INTERFACES include "mlir/IR/OpBase.td" // Interface dynamically attached to RankedTensorType and MemDescType. 
-def TT_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { - let cppNamespace = "::mlir"; +def TTG_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { + let cppNamespace = "::mlir::triton::gpu"; let methods = [ InterfaceMethod<"Returns the encoding of the tensor or memory descriptor", "mlir::Attribute", "getEncoding", (ins)>, @@ -17,8 +17,7 @@ def TT_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { "int64_t", "getRank", (ins)>, InterfaceMethod<"Returns the element type bit width", "int64_t", "getElementTypeBitWidth", (ins)>, - ]; } -#endif // TRITON_TYPE_INTERFACES +#endif // TRITON_GPU_TYPE_INTERFACES diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td index 6765ac40cb..766d5a9bd7 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td @@ -1,8 +1,9 @@ #ifndef TRITONGPU_TYPES #define TRITONGPU_TYPES -include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "mlir/IR/AttrTypeBase.td" +include "mlir/IR/BuiltinTypeInterfaces.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" class TTG_TypeDef traits = []> : TypeDef { @@ -23,8 +24,7 @@ def TTG_TokenType : TTG_TypeDef<"Token", "token"> { let skipDefaultBuilders = 1; } -def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", - "async.token", []> { +def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", "async.token", []> { let summary = "async token type"; let description = [{ `ttg.async.token` is a type returned by an asynchronous operation. @@ -33,4 +33,53 @@ def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", }]; } +// Memory descriptor type. +def TTG_MemDescType : TTG_TypeDef<"MemDesc", "memdesc", [ShapedTypeInterface]> { + let summary = "memory descriptor type (`::mlir::triton::gpu::MemDescType`) in Triton IR type system"; + + let description = [{ + Memory descriptor contains a base pointer (scalar) and a descriptor of the memory. + If mutable memory is false that means the memory is constant and can only be allocated and stored once. + A constant memory allocation is different than a tensor as it can have multiple views and the descriptor + can be changed without changing the underlying memory. 
+ }]; + + let parameters = (ins + ArrayRefParameter<"int64_t">:$shape, + "Type":$elementType, + "Attribute":$encoding, + "Attribute":$memorySpace, + "bool":$mutable_memory + ); + let extraClassDeclaration = [{ + MemDescType cloneWith(std::optional> shape, + Type elementType) const { + return MemDescType::get(shape.value_or(getShape()), elementType, getEncoding(), getMemorySpace(), getMutableMemory()); + } + + bool hasRank() const { return true; } + }]; + let builders = [ + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef":$shape, + "Type":$elementType, + "Attribute":$encoding, + "Attribute":$memorySpace + ), [{ + return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, /*mutableMemory=*/false); + }]>, + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef":$shape, + "Type":$elementType, + "Attribute":$encoding, + "Attribute":$memorySpace, + "bool":$mutableMemory + ), [{ + return $_get(elementType.getContext(), shape, elementType, encoding, memorySpace, mutableMemory); + }]> + ]; + let hasCustomAssemblyFormat = 1; +} + + #endif diff --git a/include/triton/Dialect/TritonGPU/IR/Types.h b/include/triton/Dialect/TritonGPU/IR/Types.h index edf37fef60..82ab3ae457 100644 --- a/include/triton/Dialect/TritonGPU/IR/Types.h +++ b/include/triton/Dialect/TritonGPU/IR/Types.h @@ -1,10 +1,13 @@ #ifndef TRITONGPU_IR_TYPES_H_ #define TRITONGPU_IR_TYPES_H_ +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/TypeSupport.h" #include "mlir/IR/Types.h" #define GET_TYPEDEF_CLASSES #include "triton/Dialect/TritonGPU/IR/Types.h.inc" +#include "triton/Dialect/TritonGPU/IR/TypeInterfaces.h.inc" + #endif // TRITON_IR_TYPES_H_ diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td index e257e8fead..f363032a37 100644 --- a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td +++ b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td @@ -28,7 +28,8 @@ include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td" include "mlir/Dialect/Arith/IR/ArithBase.td" include "triton/Dialect/Triton/IR/TritonTypes.td" include "triton/Dialect/Triton/IR/TritonAttrDefs.td" -include "triton/Dialect/Triton/IR/TritonTypeInterfaces.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td" +include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td" include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" // Pure include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType @@ -80,8 +81,8 @@ def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [DeclareOpInterfaceMethods:$useC, DefaultValuedAttr:$inputPrecision, @@ -100,8 +101,8 @@ def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [DeclareOpInterfaceMethods, AllTypesMatch<["inputs", "outputs"]>]> { let summary = "warp group dot wait"; - let arguments = (ins Variadic:$inputs, I32Attr:$pendings); - let results = (outs Variadic:$outputs); + let arguments = (ins Variadic:$inputs, I32Attr:$pendings); + let results = (outs Variadic:$outputs); let description = [{ Waits until there are $pendings or fewer outstanding async dot operations. 
@@ -125,7 +126,7 @@ def TTNG_InitBarrierOp : TTNG_Op<"init_barrier", [DeclareOpInterfaceMethods:$coord, - TT_MemDescType:$barrier, - TT_MemDescType:$result, + TTG_MemDescType:$barrier, + TTG_MemDescType:$result, I1:$pred, DefaultValuedAttr:$cache, DefaultValuedAttr:$evict, @@ -250,7 +251,7 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global", let arguments = ( ins TT_PtrType:$desc_ptr, Variadic:$coord, - TT_MemDescType:$src); + TTG_MemDescType:$src); let assemblyFormat = [{ $desc_ptr `[` $coord `]` $src diff --git a/lib/Analysis/Alias.cpp b/lib/Analysis/Alias.cpp index 3a141a73fa..020f513bac 100644 --- a/lib/Analysis/Alias.cpp +++ b/lib/Analysis/Alias.cpp @@ -28,7 +28,7 @@ LogicalResult SharedMemoryAliasAnalysis::visitOperation( bool pessimistic = true; auto result = op->getResult(0); // skip ops that return memdesc in a different memory space. - if (auto memdescTy = dyn_cast(result.getType())) { + if (auto memdescTy = dyn_cast(result.getType())) { if (!isa_and_nonnull( memdescTy.getMemorySpace())) return success(); @@ -43,7 +43,7 @@ LogicalResult SharedMemoryAliasAnalysis::visitOperation( aliasInfo = AliasInfo(operands[0]->getValue()); pessimistic = false; } else { - assert(!isa(result.getType()) && + assert(!isa(result.getType()) && "unknown operation creating memory descriptor"); } diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index a84f0649b6..693d222f2f 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -7,7 +7,9 @@ add_triton_library(TritonAnalysis DEPENDS TritonTableGen + TritonGPUTableGen TritonGPUAttrDefsIncGen + TritonGPUTypeInterfacesIncGen LINK_LIBS PUBLIC MLIRAnalysis diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index ac72b4f26c..6166e10199 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -533,7 +533,8 @@ bool supportMMA(Value value, int version) { // types of both the operands are identical here. assert((version == 1 || version == 2 || version == 3) && "Unexpected MMA layout version found"); - auto elemTy = cast(value.getType()).getElementType(); + auto elemTy = + cast(value.getType()).getElementType(); // FP8 is not natively supported on all mma versions but it can always be // promoted to fp16 therefore we can always support it. 
bool isFP8 = elemTy.isFloat8E5M2() || elemTy.isFloat8E4M3FN() || diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp index be2e6f584f..4914fd712b 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp @@ -12,6 +12,7 @@ using ::mlir::triton::gpu::getOrder; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getSizePerThread; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::SharedEncodingAttr; SmallVector diff --git a/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp b/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp index fee10296c8..1ed1f40635 100644 --- a/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp +++ b/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp @@ -11,6 +11,7 @@ using namespace mlir::triton; using ::mlir::triton::gpu::BlockedEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::NvidiaMmaEncodingAttr; using ::mlir::triton::gpu::SharedEncodingAttr; using ::mlir::triton::gpu::SliceEncodingAttr; diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 6597ec8a30..c681cd344c 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -159,8 +159,8 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, } bool emitTransferBetweenRegistersAndShared( - RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy, - std::optional maxVecElems, Value shmemBase, + RankedTensorType registerTy, triton::gpu::MemDescType sharedTy, + Type elemLlvmTy, std::optional maxVecElems, Value shmemBase, ArrayRef shmemStrides, Location loc, RewriterBase &rewriter, const TargetInfoBase &target, std::function perVectorCallback) { @@ -272,7 +272,8 @@ bool emitTransferBetweenRegistersAndShared( } SmallVector loadSharedToDistributed(RankedTensorType dstTy, - MemDescType srcTy, Type elemLlvmTy, + triton::gpu::MemDescType srcTy, + Type elemLlvmTy, SharedMemoryObject smemObj, Location loc, RewriterBase &rewriter, const TargetInfoBase &target) { @@ -295,10 +296,11 @@ SmallVector loadSharedToDistributed(RankedTensorType dstTy, return ret; } -void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy, - Type elemLlvmTy, ArrayRef srcVals, - Value smemBase, ArrayRef dstStrides, - Location loc, RewriterBase &rewriter, +void storeDistributedToShared(triton::gpu::MemDescType dstTy, + RankedTensorType srcTy, Type elemLlvmTy, + ArrayRef srcVals, Value smemBase, + ArrayRef dstStrides, Location loc, + RewriterBase &rewriter, const TargetInfoBase &target, std::pair *const llvmOpCount) { bool success = emitTransferBetweenRegistersAndShared( diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp index aa8840433e..ea05490c7a 100644 --- a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp @@ -1,7 +1,8 @@ #include "mlir/Support/LLVM.h" #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" 
+#include "triton/Dialect/TritonGPU/IR/Attributes.h" +#include "triton/Dialect/TritonGPU/IR/Types.h" using namespace mlir; using namespace mlir::triton; diff --git a/lib/Dialect/Triton/IR/Dialect.cpp b/lib/Dialect/Triton/IR/Dialect.cpp index f978958512..2874a3f564 100644 --- a/lib/Dialect/Triton/IR/Dialect.cpp +++ b/lib/Dialect/Triton/IR/Dialect.cpp @@ -14,7 +14,6 @@ #include "triton/Dialect/Triton/IR/AttrInterfaces.cpp.inc" #include "triton/Dialect/Triton/IR/Dialect.cpp.inc" #include "triton/Dialect/Triton/IR/OpInterfaces.cpp.inc" -#include "triton/Dialect/Triton/IR/TypeInterfaces.cpp.inc" using namespace mlir; using namespace mlir::triton; @@ -78,44 +77,6 @@ struct TritonInlinerInterface : public DialectInlinerInterface { } }; -struct TensorModel - : public TensorOrMemDesc::ExternalModel { - Type getElementType(Type pointer) const { - return cast(pointer).getElementType(); - } - Attribute getEncoding(Type pointer) const { - return cast(pointer).getEncoding(); - } - ArrayRef getShape(Type pointer) const { - return cast(pointer).getShape(); - } - int64_t getRank(Type pointer) const { - return cast(pointer).getRank(); - } - int64_t getElementTypeBitWidth(Type pointer) const { - return cast(pointer).getElementTypeBitWidth(); - } -}; - -struct MemDescModel - : public TensorOrMemDesc::ExternalModel { - Type getElementType(Type pointer) const { - return cast(pointer).getElementType(); - } - Attribute getEncoding(Type pointer) const { - return cast(pointer).getEncoding(); - } - ArrayRef getShape(Type pointer) const { - return cast(pointer).getShape(); - } - int64_t getRank(Type pointer) const { - return cast(pointer).getShape().size(); - } - int64_t getElementTypeBitWidth(Type pointer) const { - return cast(pointer).getElementType().getIntOrFloatBitWidth(); - } -}; - } // namespace void TritonDialect::initialize() { @@ -128,9 +89,6 @@ void TritonDialect::initialize() { // We can also add interface here. addInterfaces(); - - RankedTensorType::attachInterface(*getContext()); - MemDescType::attachInterface(*getContext()); } Operation *TritonDialect::materializeConstant(OpBuilder &builder, diff --git a/lib/Dialect/Triton/IR/Types.cpp b/lib/Dialect/Triton/IR/Types.cpp index 6e41e70a8e..de8925cbff 100644 --- a/lib/Dialect/Triton/IR/Types.cpp +++ b/lib/Dialect/Triton/IR/Types.cpp @@ -50,61 +50,6 @@ void PointerType::print(AsmPrinter &printer) const { } } -static constexpr llvm::StringRef kMutableMemory = "mutable"; - -Type MemDescType::parse(AsmParser &parser) { - if (parser.parseLess()) - return Type(); - - SmallVector dimensions; - if (parser.parseDimensionList(dimensions, /*allowDynamic=*/false)) - return Type(); - - // Parse the element type. 
- Type elementType; - if (parser.parseType(elementType)) - return Type(); - - Attribute encoding; - if (succeeded(parser.parseOptionalComma())) { - if (parser.parseAttribute(encoding)) - return Type(); - } - bool mutableMemory = false; - Attribute memorySpace; - if (succeeded(parser.parseOptionalComma())) { - if (failed(parser.parseOptionalKeyword(kMutableMemory))) { - if (parser.parseAttribute(memorySpace)) - return Type(); - } else { - mutableMemory = true; - } - } - if (mutableMemory == false && succeeded(parser.parseOptionalComma())) { - if (parser.parseOptionalKeyword(kMutableMemory)) - return Type(); - mutableMemory = true; - } - if (parser.parseGreater()) - return Type(); - return MemDescType::get(parser.getContext(), dimensions, elementType, - encoding, memorySpace, mutableMemory); -} - -void MemDescType::print(AsmPrinter &printer) const { - printer << "<"; - for (auto dim : getShape()) - printer << dim << "x"; - printer << getElementType(); - if (getEncoding()) - printer << ", " << getEncoding(); - if (getMemorySpace()) - printer << ", " << getMemorySpace(); - if (getMutableMemory()) - printer << ", " << kMutableMemory; - printer << ">"; -} - namespace mlir { namespace triton { diff --git a/lib/Dialect/TritonGPU/IR/CMakeLists.txt b/lib/Dialect/TritonGPU/IR/CMakeLists.txt index 98831f0db8..7486d72f36 100644 --- a/lib/Dialect/TritonGPU/IR/CMakeLists.txt +++ b/lib/Dialect/TritonGPU/IR/CMakeLists.txt @@ -7,6 +7,7 @@ add_triton_library(TritonGPUIR DEPENDS TritonGPUTableGen TritonGPUAttrDefsIncGen + TritonGPUTypeInterfacesIncGen LINK_LIBS PUBLIC MLIRGPUDialect diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 5ae07c3378..3273154fc7 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -11,6 +11,7 @@ #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" +#include "triton/Dialect/TritonGPU/IR/Types.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" #include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" @@ -19,6 +20,7 @@ // Include TableGen'erated code #include "triton/Dialect/TritonGPU/IR/Dialect.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/TypeInterfaces.cpp.inc" using namespace mlir; using namespace mlir::triton; @@ -730,10 +732,10 @@ static void maybePrintCTALayout(mlir::MLIRContext *context, //===----------------------------------------------------------------------===// // Attribute methods //===----------------------------------------------------------------------===// -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrInterfaces.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/AttrInterfaces.cpp.inc" #define GET_ATTRDEF_CLASSES -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/AttrDefs.cpp.inc" SliceEncodingAttr BlockedEncodingAttr::squeeze(int axis) { return SliceEncodingAttr::get(getContext(), axis, *this); @@ -3660,12 +3662,52 @@ void mlir::triton::gpu::dumpHWLayout(RankedTensorType tensorType) { llvm::errs() << getLayoutStr(tensorType, /*useHWPointOfView=*/true); } +struct TensorModel + : public triton::gpu::TensorOrMemDesc::ExternalModel { + Type getElementType(Type pointer) const { + return cast(pointer).getElementType(); + } + Attribute getEncoding(Type pointer) const { + return cast(pointer).getEncoding(); + } + ArrayRef getShape(Type pointer) const { + return cast(pointer).getShape(); + } + 
int64_t getRank(Type pointer) const { + return cast(pointer).getRank(); + } + int64_t getElementTypeBitWidth(Type pointer) const { + return cast(pointer).getElementTypeBitWidth(); + } +}; + +struct MemDescModel + : public triton::gpu::TensorOrMemDesc::ExternalModel { + Type getElementType(Type pointer) const { + return cast(pointer).getElementType(); + } + Attribute getEncoding(Type pointer) const { + return cast(pointer).getEncoding(); + } + ArrayRef getShape(Type pointer) const { + return cast(pointer).getShape(); + } + int64_t getRank(Type pointer) const { + return cast(pointer).getShape().size(); + } + int64_t getElementTypeBitWidth(Type pointer) const { + return cast(pointer).getElementType().getIntOrFloatBitWidth(); + } +}; + void TritonGPUDialect::initialize() { registerTypes(); addAttributes< #define GET_ATTRDEF_LIST -#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" +#include "triton/Dialect/TritonGPU/IR/AttrDefs.cpp.inc" >(); addOperations< #define GET_OP_LIST @@ -3674,6 +3716,9 @@ void TritonGPUDialect::initialize() { >(); addInterfaces(); addInterfaces(); + + RankedTensorType::attachInterface(*getContext()); + MemDescType::attachInterface(*getContext()); } // verify TritonGPU ops diff --git a/lib/Dialect/TritonGPU/IR/Types.cpp b/lib/Dialect/TritonGPU/IR/Types.cpp index 77f673cc27..fe87626203 100644 --- a/lib/Dialect/TritonGPU/IR/Types.cpp +++ b/lib/Dialect/TritonGPU/IR/Types.cpp @@ -27,6 +27,61 @@ void TokenType::print(AsmPrinter &printer) const { printer << "<" << getType() << ">"; } +static constexpr llvm::StringRef kMutableMemory = "mutable"; + +Type MemDescType::parse(AsmParser &parser) { + if (parser.parseLess()) + return Type(); + + SmallVector dimensions; + if (parser.parseDimensionList(dimensions, /*allowDynamic=*/false)) + return Type(); + + // Parse the element type. 
+ Type elementType; + if (parser.parseType(elementType)) + return Type(); + + Attribute encoding; + if (succeeded(parser.parseOptionalComma())) { + if (parser.parseAttribute(encoding)) + return Type(); + } + bool mutableMemory = false; + Attribute memorySpace; + if (succeeded(parser.parseOptionalComma())) { + if (failed(parser.parseOptionalKeyword(kMutableMemory))) { + if (parser.parseAttribute(memorySpace)) + return Type(); + } else { + mutableMemory = true; + } + } + if (mutableMemory == false && succeeded(parser.parseOptionalComma())) { + if (parser.parseOptionalKeyword(kMutableMemory)) + return Type(); + mutableMemory = true; + } + if (parser.parseGreater()) + return Type(); + return MemDescType::get(parser.getContext(), dimensions, elementType, + encoding, memorySpace, mutableMemory); +} + +void MemDescType::print(AsmPrinter &printer) const { + printer << "<"; + for (auto dim : getShape()) + printer << dim << "x"; + printer << getElementType(); + if (getEncoding()) + printer << ", " << getEncoding(); + if (getMemorySpace()) + printer << ", " << getMemorySpace(); + if (getMutableMemory()) + printer << ", " << kMutableMemory; + printer << ">"; +} + //===----------------------------------------------------------------------===// // Triton Dialect //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp index 0d7bd5bdc2..5ffddee023 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp @@ -121,7 +121,7 @@ static int createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc, Value src = loadOp.getPtr(); Value mask = loadOp.getMask(); Value other = loadOp.getOther(); - tt::MemDescType allocTy = cast(alloc.getType()); + ttg::MemDescType allocTy = cast(alloc.getType()); auto convertBlockLayout = [&](Value src, ttg::BlockedEncodingAttr enc) { auto ty = cast(src.getType()); @@ -146,7 +146,7 @@ static int createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc, copyOffsets[0] = insertIdx; Attribute sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - tt::MemDescType subviewTy = tt::MemDescType::get( + ttg::MemDescType subviewTy = ttg::MemDescType::get( allocTy.getShape().drop_front(), allocTy.getElementType(), allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto view = builder.createWithStage( @@ -168,13 +168,13 @@ static int createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc, loc, stageForFirstUse, clusterForFirstUse, subviewTy, alloc, loadOffsets); if (loadIsMMAv3Shared) { auto alloc = cast((*loadOp->getUsers().begin())); - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); alloc.erase(); } else { SmallVector allocsToErase; for (Operation *user : loadOp->getUsers()) { if (auto alloc = dyn_cast(user)) { - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); allocsToErase.push_back(alloc); } } @@ -227,15 +227,15 @@ createTMAAsyncCopy(scf::ForOp &forOp, tt::ExperimentalDescriptorLoadOp loadOp, auto [stageForFirstUse, clusterForFirstUse] = tt::getStageCluster(firstUse); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + 
ttg::SharedMemorySpaceAttr::get(forOp.getContext()); Value zero = builder.createWithStage( forOp.getLoc(), stage, clusterId, 0, 32); builder.setInsertionPoint(loadOp); Location loc = loadOp.getLoc(); - tt::MemDescType allocTy = cast(alloc.getType()); + ttg::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); copyOffsets[0] = insertIdx; - tt::MemDescType subviewTy = tt::MemDescType::get( + ttg::MemDescType subviewTy = ttg::MemDescType::get( allocTy.getShape().drop_front(), allocTy.getElementType(), allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto view = builder.createWithStage( @@ -259,13 +259,13 @@ createTMAAsyncCopy(scf::ForOp &forOp, tt::ExperimentalDescriptorLoadOp loadOp, loc, stageForFirstUse, clusterForFirstUse, subviewTy, alloc, loadOffsets); if (loadIsMMAv3Shared) { auto alloc = cast((*loadOp->getUsers().begin())); - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); alloc.erase(); } else { SmallVector allocsToErase; for (Operation *user : loadOp->getUsers()) { if (auto alloc = dyn_cast(user)) { - replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); + tt::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); allocsToErase.push_back(alloc); } } @@ -525,15 +525,15 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, ttg::SharedEncodingAttr sharedEnc, unsigned distance) { OpBuilder builder(forOp); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + ttg::SharedMemorySpaceAttr::get(forOp.getContext()); auto ty = cast(loadOp->getResultTypes()[0]); SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); bufferShape.insert(bufferShape.begin(), distance); - Type memdescType = mlir::triton::MemDescType::get( - bufferShape, ty.getElementType(), sharedEnc, sharedMemorySpace, - /*mutableMemory*/ true); - Value alloc = builder.create( - loadOp->getLoc(), memdescType, Value()); + Type memdescType = ttg::MemDescType::get(bufferShape, ty.getElementType(), + sharedEnc, sharedMemorySpace, + /*mutableMemory*/ true); + Value alloc = + builder.create(loadOp->getLoc(), memdescType, Value()); return alloc; } @@ -541,7 +541,7 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, static Value createBarrierAlloc(scf::ForOp &forOp, unsigned distance) { OpBuilder builder(forOp); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + ttg::SharedMemorySpaceAttr::get(forOp.getContext()); Location loc = forOp.getLoc(); auto context = forOp.getContext(); auto barrierCTALayout = @@ -549,14 +549,14 @@ static Value createBarrierAlloc(scf::ForOp &forOp, unsigned distance) { /*CTASplitNum=*/{1}, /*CTAOrder=*/{0}); auto barrierEncoding = ttg::SharedEncodingAttr::get(context, 1, 1, 1, {0}, barrierCTALayout); - Type barrierMemDescType = tt::MemDescType::get( + Type barrierMemDescType = ttg::MemDescType::get( {distance}, builder.getI64Type(), barrierEncoding, sharedMemorySpace, /*mutableMemory=*/true); Type singleBarrierMemDescType = - tt::MemDescType::get({1}, builder.getI64Type(), barrierEncoding, - sharedMemorySpace, /*mutableMemory=*/true); - Value barrierAlloc = builder.create( - loc, barrierMemDescType, Value()); + ttg::MemDescType::get({1}, builder.getI64Type(), barrierEncoding, + sharedMemorySpace, /*mutableMemory=*/true); + Value barrierAlloc = + builder.create(loc, barrierMemDescType, Value()); for (unsigned i = 0; i 
< distance; i++) { Value idx = builder.create(loc, i, 32); Value barrierView = builder.create( @@ -654,10 +654,10 @@ static void createTMABarrierAndWait( Location loc = forOp.getLoc(); OpBuilderWithStage builder(forOp); Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(builder.getContext()); - tt::MemDescType barrierTy = tt::MemDescType::get( + ttg::SharedMemorySpaceAttr::get(builder.getContext()); + ttg::MemDescType barrierTy = ttg::MemDescType::get( {1}, builder.getI64Type(), - cast(barrierAlloc.getType()).getEncoding(), + cast(barrierAlloc.getType()).getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); builder.setInsertionPoint(group[0]->loadOp); @@ -838,14 +838,14 @@ createAsyncOps(scf::ForOp &forOp, static void invalidateBarriers(OpBuilder &builder, SmallVector &barriers) { Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(builder.getContext()); + ttg::SharedMemorySpaceAttr::get(builder.getContext()); for (Value barrier : barriers) { - int numBarriers = cast(barrier.getType()).getShape()[0]; + int numBarriers = cast(barrier.getType()).getShape()[0]; for (int i = 0; i < numBarriers; i++) { Value idx = builder.create(barrier.getLoc(), i, 32); - tt::MemDescType barrierTy = tt::MemDescType::get( + ttg::MemDescType barrierTy = ttg::MemDescType::get( {1}, builder.getI64Type(), - cast(barrier.getType()).getEncoding(), + cast(barrier.getType()).getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); Value barrierView = builder.create( @@ -1092,7 +1092,7 @@ static void threadValuesThroughWait(ttng::WarpGroupDotWaitOp wait, for (ttng::WarpGroupDotOp dot : asyncDots) { for (Value operand : dot.getOperands()) { - if (isa(operand.getType())) { + if (isa(operand.getType())) { newOperands.insert(operand); } } @@ -1110,12 +1110,12 @@ static void threadValuesThroughWait(ttng::WarpGroupDotWaitOp wait, }; for (int i = 0; i < origNumOperands; i++) { Value operand = wait.getResult(i); - if (!isa(operand.getType())) + if (!isa(operand.getType())) operand.replaceAllUsesWith(newWait.getResult(i)); } for (int i = origNumOperands; i < newOperands.size(); i++) { Value operand = newWait.getOperand(i); - if (!isa(operand.getType())) + if (!isa(operand.getType())) operand.replaceUsesWithIf(newWait.getResult(i), dominatedByNewWait); } wait->erase(); @@ -1178,7 +1178,7 @@ static std::optional dotCanBeProperlyAsync(ttng::WarpGroupDotOp dotOp, // Rule 1: All shmem operands are multi-buffered. auto checkOperand = [&](Value operand) { if (!isa( - cast(operand.getType()).getEncoding())) { + cast(operand.getType()).getEncoding())) { // Rule 1a: Register operands must not be modified within the loop. // First, check for chained WGMMA as an exception. 
if (auto cvt = dyn_cast(operand.getDefiningOp())) { diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp index f90c6b7475..29e75ee535 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp @@ -147,10 +147,10 @@ void mlir::triton::replaceUsesAndPropagateType(OpBuilder &builder, builder.setInsertionPoint(user); Value newVal; if (auto subview = dyn_cast(user)) { - triton::MemDescType oldType = subview.getType(); + triton::gpu::MemDescType oldType = subview.getType(); bool isMutable = - cast(val.getType()).getMutableMemory(); - Type newDstType = triton::MemDescType::get( + cast(val.getType()).getMutableMemory(); + Type newDstType = triton::gpu::MemDescType::get( oldType.getShape(), oldType.getElementType(), oldType.getEncoding(), oldType.getMemorySpace(), isMutable); newVal = builder.create( diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp index 1cc3df7ec3..b24ac95387 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp @@ -42,8 +42,8 @@ static Value createAlloc(scf::ForOp &forOp, Attribute sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(ty.getContext()); Type memdescType = - tt::MemDescType::get(ty.getShape(), ty.getElementType(), encoding, - sharedMemorySpace, /*mutableMemory*/ true); + ttg::MemDescType::get(ty.getShape(), ty.getElementType(), encoding, + sharedMemorySpace, /*mutableMemory*/ true); Value alloc = builder.create(storeOp->getLoc(), memdescType, Value()); return alloc; diff --git a/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp b/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp index 2cbc00142b..46a55d550d 100644 --- a/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp @@ -114,7 +114,7 @@ Value Prefetcher::generatePrefetch(Value v, unsigned opIdx, bool isPrologue, std::optional offsetK, std::optional shapeK) { // opIdx: 0 => a, 1 => b - auto type = cast(v.getType()); + auto type = cast(v.getType()); SmallVector shape{type.getShape().begin(), type.getShape().end()}; SmallVector offset{0, 0}; Type elementType = type.getElementType(); @@ -136,8 +136,8 @@ Value Prefetcher::generatePrefetch(Value v, unsigned opIdx, bool isPrologue, builder.create(v.getLoc(), off, 32)); Value newSmem = builder.create( v.getLoc(), - triton::MemDescType::get(shape, elementType, type.getEncoding(), - type.getMemorySpace()), + triton::gpu::MemDescType::get(shape, elementType, type.getEncoding(), + type.getMemorySpace()), v, offsetsVal); auto dotOperandEnc = triton::gpu::DotOperandEncodingAttr::get( diff --git a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp index b1e296c1bb..af756c6d83 100644 --- a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp +++ b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp @@ -58,7 +58,7 @@ class TritonGPUReduceDataDuplicationPass } auto sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(srcType.getContext()); - auto tmpType = triton::MemDescType::get( + auto tmpType = triton::gpu::MemDescType::get( dstType.getShape(), dstType.getElementType(), triton::gpu::SharedEncodingAttr::get( mod.getContext(), dstDotOp, srcType.getShape(), sharedOrder, diff --git 
a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index fa8ec2b926..b8f3abfcac 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -944,7 +944,7 @@ getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) { if (user->getNumResults() != 1) return std::nullopt; if (auto memDesc = - dyn_cast(user->getResult(0).getType())) { + dyn_cast(user->getResult(0).getType())) { // First time we find a shared encoding in the chain, save it and try to // use it if it is compatible with the other users. tempAttr = cast(memDesc.getEncoding()); @@ -955,10 +955,11 @@ getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) { if (!isa(user)) return std::nullopt; auto dotOpEnc = dyn_cast( - cast(user->getResult(0).getType()).getEncoding()); + cast(user->getResult(0).getType()) + .getEncoding()); if (!dotOpEnc) return std::nullopt; - auto srcTy = cast(val.getType()); + auto srcTy = cast(val.getType()); auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); auto order = ttg::getOrder(srcTy.getEncoding()); unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); diff --git a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp index 92d9b589a2..942eb5423d 100644 --- a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp +++ b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp @@ -42,8 +42,10 @@ mlir::LogicalResult WarpGroupDotOp::inferReturnTypes( inferredReturnTypes.push_back(accTy); // verify encodings - auto aEnc = cast(operands[0].getType()).getEncoding(); - auto bEnc = cast(operands[1].getType()).getEncoding(); + auto aEnc = + cast(operands[0].getType()).getEncoding(); + auto bEnc = + cast(operands[1].getType()).getEncoding(); auto retEnc = accTy.getEncoding(); if (aEnc) { assert(bEnc); @@ -62,10 +64,10 @@ void WarpGroupDotOp::getEffects( &effects) { auto &a = getAMutable(); auto &b = getBMutable(); - if (isa(a.get().getType())) + if (isa(a.get().getType())) effects.emplace_back(MemoryEffects::Read::get(), &a, mlir::triton::gpu::SharedMemory::get()); - if (isa(b.get().getType())) + if (isa(b.get().getType())) effects.emplace_back(MemoryEffects::Read::get(), &b, mlir::triton::gpu::SharedMemory::get()); } @@ -73,11 +75,12 @@ void WarpGroupDotOp::getEffects( bool WarpGroupDotOp::needsPartialAccumulator() { const auto &a = getA(); const auto &d = getD(); - auto aTensorTy = cast(a.getType()); - auto aElTy = cast(a.getType()).getElementType(); + auto aTensorTy = cast(a.getType()); + auto aElTy = cast(a.getType()).getElementType(); bool isFP8 = aElTy.isFloat8E5M2() || aElTy.isFloat8E4M3FN() || aElTy.isFloat8E5M2FNUZ() || aElTy.isFloat8E4M3FNUZ(); - bool accFP32 = cast(d.getType()).getElementType().isF32(); + bool accFP32 = + cast(d.getType()).getElementType().isF32(); uint32_t maxNumImpreciseAcc = getMaxNumImpreciseAcc(); return isFP8 && accFP32 && maxNumImpreciseAcc <= aTensorTy.getShape()[1]; } @@ -93,7 +96,8 @@ LogicalResult WarpGroupDotWaitOp::inferReturnTypes( return mlir::success(); } -static LogicalResult verifyBarrierType(Operation *op, MemDescType barrierType) { +static LogicalResult +verifyBarrierType(Operation *op, mlir::triton::gpu::MemDescType barrierType) { if (!barrierType.getElementType().isInteger(64) || barrierType.getShape() != ArrayRef({1})) return op->emitOpError( diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index e9ce077184..23c598ee16 100644 --- a/python/test/unit/language/test_core.py +++ 
b/python/test/unit/language/test_core.py @@ -5333,10 +5333,10 @@ def test_convert2d(M, N, src_layout, interm_layout, dst_layout, dtype, device, t %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> """ if interm_layout is None else f""" - %15 = triton_gpu.local_alloc %9 : (tensor<{M}x{N}xi32, #src>) -> !tt.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> - %16 = triton_gpu.local_load %15 : !tt.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xi32, #src> - %17 = triton_gpu.local_alloc %11 : (tensor<{M}x{N}xf16, #src>) -> !tt.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> - %18 = triton_gpu.local_load %17 : !tt.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xf16, #src> + %15 = triton_gpu.local_alloc %9 : (tensor<{M}x{N}xi32, #src>) -> !triton_gpu.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> + %16 = triton_gpu.local_load %15 : !triton_gpu.memdesc<{M}x{N}xi32, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xi32, #src> + %17 = triton_gpu.local_alloc %11 : (tensor<{M}x{N}xf16, #src>) -> !triton_gpu.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> + %18 = triton_gpu.local_load %17 : !triton_gpu.memdesc<{M}x{N}xf16, #interm, #triton_gpu.shared_memory> -> tensor<{M}x{N}xf16, #src> %12 = triton_gpu.convert_layout %16 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> %13 = triton_gpu.convert_layout %18 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> diff --git a/test/Analysis/test-alias.mlir b/test/Analysis/test-alias.mlir index 109395ae04..e67e55fb1c 100644 --- a/test/Analysis/test-alias.mlir +++ b/test/Analysis/test-alias.mlir @@ -41,7 +41,7 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-LABEL: alloc tt.func @alloc(%A : !tt.ptr) { // CHECK: %0 -> %0 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } @@ -49,40 +49,40 @@ tt.func @alloc(%A : !tt.ptr) { tt.func @alloc_init(%A : !tt.ptr) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> // CHECK: %0 -> %0 - %cst1 = triton_gpu.local_alloc %cst0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst0 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } // CHECK-LABEL: trans tt.func @trans(%A : !tt.ptr) { // CHECK: %0 -> %0 - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %1 -> %0 - %b = triton_gpu.memdesc_trans %tensor {order=array} : !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.memdesc_trans %tensor {order=array} : !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> tt.return } // CHECK-LABEL: subview -tt.func @subview(%A : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { +tt.func @subview(%A : 
!triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { %index = arith.constant 0 : i32 // CHECK: %0 -> %0 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %0 - %cst1 = triton_gpu.memdesc_subview %a[%index, %index, %index] : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.memdesc_subview %a[%index, %index, %index] : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } // CHECK-LABEL: if_alias tt.func @if_alias(%i1 : i1) { // CHECK: %0 -> %0 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %1 -> %1 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %0,%1 - %cst2 = scf.if %i1 -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { - scf.yield %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = scf.if %i1 -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { + scf.yield %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { - scf.yield %b : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -90,11 +90,11 @@ tt.func @if_alias(%i1 : i1) { // CHECK-LABEL: for tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { // CHECK: %0 -> %0 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %1 -> %1 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: %2 -> %2 - %c = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %arg6 -> %0 // CHECK-NEXT: %arg7 -> %1 // CHECK-NEXT: %arg8 -> %2 @@ -102,8 +102,8 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-NEXT: %3#1 -> %0,%1 // CHECK-NEXT: %3#2 -> %0,%1,%2 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a, %b_shared = %b, %c_shared = %c) -> - (!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + (!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -111,11 +111,11 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-LABEL: for_if tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: %0 -> %0 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %1 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %2 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %arg7 -> %0 // CHECK-NEXT: %arg8 -> %1 // CHECK-NEXT: %arg9 -> %2 @@ -123,14 +123,14 @@ tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : // CHECK-NEXT: %3#1 -> %0,%1 // CHECK-NEXT: %3#2 -> %0,%1,%2 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> - (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { scf.if %i1 { %index = arith.constant 8 : i32 // CHECK-NEXT: %4 -> %0,%1 - %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.yield } - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, 
#A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -138,11 +138,11 @@ tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : // CHECK-LABEL: for_for_if tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: %0 -> %0 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %1 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %2 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %arg7 -> %0 // CHECK-NEXT: %arg8 -> %1 // CHECK-NEXT: %arg9 -> %2 @@ -150,23 +150,23 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-NEXT: %3#1 -> %1 // CHECK-NEXT: %3#2 -> %2,%6,%6 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> - (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { // CHECK-NEXT: %arg11 -> %2,%6,%6 // CHECK-NEXT: %4 -> %2,%6,%6 - %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { // CHECK-NEXT: %5 -> %6,%6 - %c_shared_next_next = scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { // CHECK-NEXT: %6 -> %6 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: %6 -> %6 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst0 : 
!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %a_shared, %b_shared, %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %a_shared, %b_shared, %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -175,29 +175,29 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, tt.func @cf_for(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr, %arg4: !tt.ptr) { %idx = arith.constant 0 : i32 // CHECK: %0 -> %0 - %cst = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %1 -> %1 - %cst_0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %2 -> %0 - %0 = triton_gpu.memdesc_subview %cst[%idx, %idx] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.memdesc_subview %cst[%idx, %idx] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> gpu.barrier // CHECK-NEXT: %3 -> %3 - %cst_1 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: %5 -> %0,%1,%3 // CHECK-NEXT: %6 -> %0,%1,%3 // CHECK-NEXT: %7 -> %0,%1,%3 - cf.br ^bb1(%arg0, %cst, %cst_0, %cst_1 : index, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) -^bb1(%1: index, %2: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %3: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %4: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>): // 2 preds: ^bb0, ^bb2 + cf.br ^bb1(%arg0, %cst, %cst_0, %cst_1 : index, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) +^bb1(%1: index, %2: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %3: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, %4: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>): // 2 preds: ^bb0, ^bb2 %5 = arith.cmpi slt, %1, %arg1 : index 
cf.cond_br %5, ^bb2, ^bb3 ^bb2: // pred: ^bb1 gpu.barrier %8 = arith.addi %1, %arg2 : index - cf.br ^bb1(%8, %4, %2, %3 : index, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) + cf.br ^bb1(%8, %4, %2, %3 : index, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) ^bb3: // pred: ^bb1 gpu.barrier // CHECK-NEXT: %10 -> %0 - %9 = triton_gpu.memdesc_subview %0[%idx, %idx] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %9 = triton_gpu.memdesc_subview %0[%idx, %idx] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir index db2e2947b8..fe4da43ca9 100644 --- a/test/Analysis/test-allocation.mlir +++ b/test/Analysis/test-allocation.mlir @@ -95,47 +95,47 @@ tt.func @reusable(%A : !tt.ptr) { // CHECK-LABEL: preallocate tt.func @preallocate(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 4096, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 1024 - %c = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 1024 - %cst4 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 6144, size = 2048 - %e = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %a : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %e = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %a : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 2048 - %d = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %b : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %d = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %b : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 10240, size = 2048 - %f = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst4 : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %c : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %f = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst4 : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %c : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 2048 - %cst5 = triton_gpu.local_alloc : () -> !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst5 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 4096 - %g = triton_gpu.local_alloc : () -> !tt.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %e : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %g = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %e : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 4096 - %h = triton_gpu.local_alloc : () -> !tt.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %d : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %h = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %d : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 4096 - %i = triton_gpu.local_alloc : () -> !tt.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %f : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, 
mutable> - triton_gpu.local_dealloc %cst5 : !tt.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %i = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %f : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst5 : !triton_gpu.memdesc<64x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 12288 } @@ -145,11 +145,11 @@ tt.func @preallocate(%A : !tt.ptr) { tt.func @unused(%A : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL> // CHECK: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc %cst : (tensor<32x16xf16, #AL>) -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<32x16xf16, #AL>) -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK-NEXT: offset = 0, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK: size = 1024 } @@ -158,33 +158,33 @@ tt.func @unused(%A : !tt.ptr) { // CHECK-LABEL: longlive tt.func @longlive(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst3 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 512 - %cst4 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, 
#A_SHARED, #triton_gpu.shared_memory, mutable> + %cst4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 512 - %cst5 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst5 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 512 - %cst6 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst6 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 1024 - %c = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst3 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst4 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst3 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst4 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 1024 - %d = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %d = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 4096 } @@ -193,43 +193,43 @@ tt.func @longlive(%A : !tt.ptr) { // CHECK-LABEL: multi_color tt.func @multi_color(%A : !tt.ptr) { // CHECK: offset = 0, size = 64 - %cst = triton_gpu.local_alloc : () -> !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1536, size = 32 - %cst_0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1664, size = 128 - %cst_1 = triton_gpu.local_alloc : () -> !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: scratch offset = 128, size = 1152 %0 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> - %1 = triton_gpu.local_load %cst : !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> + %1 = triton_gpu.local_load %cst : !triton_gpu.memdesc<4x8xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> // CHECK-NEXT: offset = 0, size = 128 - %cst_3 = triton_gpu.local_alloc : () -> !tt.memdesc<4x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %2 = triton_gpu.local_load %cst_0 : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %cst_3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %2 = triton_gpu.local_load %cst_0 : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: scratch offset = 0, size = 1152 %3 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> // CHECK-NEXT: offset = 0, size = 256 - %cst_4 = triton_gpu.local_alloc : () -> !tt.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 256, size = 64 - %cst_5 = triton_gpu.local_alloc : () -> !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %4 = triton_gpu.local_load %cst_5 : !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> - %5 = triton_gpu.local_load %cst_5 : !tt.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> + %cst_5 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %4 = triton_gpu.local_load %cst_5 : !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> + %5 = triton_gpu.local_load %cst_5 : !triton_gpu.memdesc<4x8xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x8xf16, #AL> // CHECK-NEXT: offset = 1024, size = 512 - %cst_6 = triton_gpu.local_alloc : () -> !tt.memdesc<8x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_6 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<8x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1792, size = 128 - %cst_7 = triton_gpu.local_alloc : () -> !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %6 = triton_gpu.local_load %cst_0 : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %cst_7 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %6 = triton_gpu.local_load %cst_0 : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: offset = 1024, size = 512 - %cst_8 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_8 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 256, size = 32 - %cst_9 = triton_gpu.local_alloc : () -> !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_9 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst_10 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %7 = triton_gpu.local_load %cst_1 : !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> - %8 = triton_gpu.local_load %cst_4 : !tt.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> 
tensor<4x32xf16, #AL> + %cst_10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %7 = triton_gpu.local_load %cst_1 : !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> + %8 = triton_gpu.local_load %cst_4 : !triton_gpu.memdesc<4x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x32xf16, #AL> // CHECK-NEXT: scratch offset = 0, size = 1152 %9 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x4xf16, #AL> - %10 = triton_gpu.local_load %cst_7 : !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> + %10 = triton_gpu.local_load %cst_7 : !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> %cst_12 = arith.constant dense<0.000000e+00> : tensor<4x16xf16, #AL> %cst_13 = arith.constant dense<0.000000e+00> : tensor<8x32xf16, #AL> // CHECK-NEXT: size = 1920 @@ -240,25 +240,25 @@ tt.func @multi_color(%A : !tt.ptr) { // CHECK-LABEL: multi_color_multi_rounds tt.func @multi_color_multi_rounds(%arg0: !tt.ptr) { // CHECK: offset = 0, size = 32 - %cst = triton_gpu.local_alloc : () -> !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1280, size = 128 - %cst_0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 8192 - %cst_1 = triton_gpu.local_alloc : () -> !tt.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst_1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: scratch offset = 128, size = 1152 %0 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> - %1 = triton_gpu.local_load %cst : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %1 = triton_gpu.local_load %cst : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: offset = 1152, size = 128 - %cst_3 = triton_gpu.local_alloc : () -> !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %2 = triton_gpu.local_load %cst : !tt.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> + %cst_3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %2 = triton_gpu.local_load %cst : !triton_gpu.memdesc<4x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<4x4xf16, #AL> // CHECK-NEXT: offset = 0, size = 512 - %cst_4 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %3 = triton_gpu.local_load %cst_0 : !tt.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> - %4 = triton_gpu.local_load %cst_1 : !tt.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<1024x4xf16, #AL> + %cst_4 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> + %3 = triton_gpu.local_load %cst_0 : !triton_gpu.memdesc<16x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x4xf16, #AL> + %4 = triton_gpu.local_load %cst_1 : !triton_gpu.memdesc<1024x4xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<1024x4xf16, #AL> // CHECK-NEXT: scratch offset = 0, size = 1152 %5 = triton_gpu.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL> - %6 = triton_gpu.local_load %cst_3 : !tt.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> + %6 = triton_gpu.local_load %cst_3 : !triton_gpu.memdesc<2x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<2x32xf16, #AL> // CHECK-NEXT: size = 10240 tt.return } @@ -267,10 +267,10 @@ tt.func @multi_color_multi_rounds(%arg0: !tt.ptr) { // CHECK-LABEL: alloc tt.func @alloc(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: offset = 0, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 512 } @@ -279,10 +279,10 @@ tt.func @alloc(%A : !tt.ptr) { // CHECK-LABEL: dealloc tt.func @dealloc(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: offset = 1024, size = 1024 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 2048 } @@ -303,8 +303,8 @@ tt.func @scratch() { // CHECK-LABEL: trans tt.func @trans(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %b = triton_gpu.memdesc_trans %tensor {order=array} : !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.memdesc_trans %tensor {order=array} : !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> tt.return } @@ -312,9 +312,9 @@ tt.func @trans(%A : !tt.ptr) { // CHECK-LABEL: extract_slice tt.func @extract_slice(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> 
!triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %index = arith.constant 0 : i32 - %cst1 = triton_gpu.memdesc_subview %cst0[%index, %index, %index] : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.memdesc_subview %cst0[%index, %index, %index] : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 512 } @@ -326,9 +326,9 @@ tt.func @atomic_scalar(%arg3: !tt.ptr) -> i32 { // CHECK: size = 8196 %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return %4 : i32 } @@ -338,9 +338,9 @@ tt.func @atomic_scalar_no_use(%arg3: !tt.ptr) { // CHECK: size = 8192 %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -349,25 +349,25 @@ tt.func @atomic_scalar_no_use(%arg3: !tt.ptr) { // CHECK-LABEL: if tt.func @if(%i1 : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.if %i1 { // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, 
#A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } // CHECK-NEXT: offset = 0, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst3 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst3 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst3 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 3072 } @@ -377,28 +377,28 @@ tt.func @if(%i1 : i1) { // CHECK-LABEL: if_else tt.func @if_else(%i1 : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 1024, size = 512 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.if %i1 { // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 2048, size = 1024 - %b = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: offset = 2048, size = 512 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 3072, size = 512 - %cst3 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst3 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 4096, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst2 : !tt.memdesc<1x16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst3 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst2 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst3 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } // CHECK-NEXT: offset = 2048, size = 1024 - %a = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst0 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %cst1 : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst0 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %cst1 : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 5120 } @@ -408,13 +408,13 @@ tt.func @if_else(%i1 : i1) { // CHECK-LABEL: for tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, 
!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 24576 @@ -423,18 +423,18 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-LABEL: for_if_slice tt.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { scf.if %i1 { %index = arith.constant 8 : i32 - %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.memdesc_subview %a_shared[%index, %index] : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> scf.yield } - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 24576 @@ -444,16 +444,16 @@ tt.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr // CHECK-LABEL: for_use_ancestor tt.func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> 
!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - %c0 = triton_gpu.memdesc_trans %c_shared_init {order=array} : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c0 = triton_gpu.memdesc_trans %c_shared_init {order=array} : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #A_SHARED_T, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 24576, size = 8192 - %c1 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %b_shared, %a_shared: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %c1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %b_shared, %a_shared: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 32768 @@ -464,28 +464,28 @@ tt.func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr< // CHECK-LABEL: for_for_if tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 - %a_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 8192, size = 8192 - %b_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %b_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 16384, size = 8192 - %c_shared_init = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, 
#triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { - %c_shared_next_next = scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { + %c_shared_init = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>) { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> { // CHECK-NEXT: offset = 24576, size = 8192 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: offset = 32768, size = 8192 - %cst1 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - scf.yield %cst1 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %cst1 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } - scf.yield %a_shared, %b_shared, %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + scf.yield %a_shared, %b_shared, %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } // CHECK-NEXT: offset = 0, size = 8192 - %cst2 = triton_gpu.local_alloc : () -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst2 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 40960 } @@ -497,7 +497,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: alloc1 tt.func @alloc1(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> 
!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 512 } @@ -505,7 +505,7 @@ tt.func @alloc1(%A : !tt.ptr) { // CHECK-LABEL: alloc2 tt.func @alloc2(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return // CHECK-NEXT: size = 1024 } @@ -514,10 +514,10 @@ tt.func @alloc2(%A : !tt.ptr) { tt.func @alloc3(%cond : i1) { scf.if %cond { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } else { // CHECK-NEXT: offset = 0, size = 1024 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return // CHECK-NEXT: size = 1024 @@ -539,7 +539,7 @@ tt.func @alloc4(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: single_call tt.func @single_call(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: virtual offset = 0, size = 512 tt.call @alloc1(%A) : (!tt.ptr) -> () @@ -550,7 +550,7 @@ tt.func @single_call(%A : !tt.ptr) { // CHECK-LABEL: multiple_calls tt.func @multiple_calls(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 512 tt.call @alloc1(%A) : (!tt.ptr) -> () %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> @@ -565,9 +565,9 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> scf.if %cond { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: offset = 0, size = 1024 - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x32xf16, #AL>) -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x32xf16, #AL>) -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 512 tt.call @alloc1(%A) : (!tt.ptr) -> () } else { @@ -582,7 +582,7 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: for_calls tt.func @for_calls(%A : !tt.ptr, %cond : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = 
triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> %lb = arith.constant 0 : index %ub = arith.constant 10 : index @@ -598,7 +598,7 @@ tt.func @for_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: call_graph_1 tt.func @call_graph_1(%A : !tt.ptr, %cond : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 1024 tt.call @alloc3(%cond) : (i1) -> () tt.return @@ -608,7 +608,7 @@ tt.func @call_graph_1(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: call_graph_2 tt.func @call_graph_2(%A : !tt.ptr, %cond : i1) { // CHECK: offset = 0, size = 512 - %cst0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %cst0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: virtual offset = 0, size = 1024 tt.call @alloc4(%A, %cond) : (!tt.ptr, i1) -> () tt.return diff --git a/test/Analysis/test-membar.mlir b/test/Analysis/test-membar.mlir index 65d802d995..a2711ba98f 100644 --- a/test/Analysis/test-membar.mlir +++ b/test/Analysis/test-membar.mlir @@ -46,10 +46,10 @@ tt.func @raw_single_block(%A : !tt.ptr) { %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %0 = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -59,14 +59,14 @@ tt.func @war_single_block(%A : !tt.ptr) { %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %0 = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: triton_gpu.local_alloc // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: %4 = triton_gpu.local_alloc - %4 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %4 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } @@ -76,25 +76,25 @@ tt.func @war_single_block_local_store(%A : !tt.ptr) 
{ %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %0 = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> %1 = tt.load %0, %cst1, %cst2 : tensor<128x32x!tt.ptr, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_alloc // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_store - triton_gpu.local_store %1, %2 : tensor<128x32xf16, #AL> -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %1, %2 : tensor<128x32xf16, #AL> -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> tt.return } // CHECK-LABEL: scratch tt.func @scratch(%arg: tensor<16x16xf16, #AL>) { - %cst0 = triton_gpu.local_alloc %arg : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %arg : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load // CHECK: gpu.barrier // CHECK: tt.reduce - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> %2 = "tt.reduce" (%1) ({ ^bb0(%arg1: f16, %arg2: f16): %add = arith.addf %arg1, %arg2 : f16 @@ -105,34 +105,34 @@ tt.func @scratch(%arg: tensor<16x16xf16, #AL>) { // CHECK-LABEL: async_wait tt.func @async_wait(%arg: tensor<32x16xf16, #AL>) { - %cst0 = triton_gpu.local_alloc %arg : (tensor<32x16xf16, #AL>) -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %arg : (tensor<32x16xf16, #AL>) -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: triton_gpu.async_wait triton_gpu.async_wait {num = 4 : i32} // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<32x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<32x16xf16, #AL> tt.return } // CHECK-LABEL: subview tt.func @subview() { %cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL> - %a = triton_gpu.local_alloc %cst0 : (tensor<32x16xf16, #AL>) -> !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst0 : (tensor<32x16xf16, #AL>) -> !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> %index = arith.constant 0 : i32 - %0 = triton_gpu.memdesc_subview %a[%index, %index] : !tt.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %0 = triton_gpu.memdesc_subview %a[%index, %index] : !triton_gpu.memdesc<32x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, 
#triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } // CHECK-LABEL: trans -tt.func @trans(%a: !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { +tt.func @trans(%a: !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK-NOT: gpu.barrier - %b = triton_gpu.memdesc_trans %a {order=array} : !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> !tt.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory> + %b = triton_gpu.memdesc_trans %a {order=array} : !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<32x16xf16, #A_SHARED_T, #triton_gpu.shared_memory> tt.return } @@ -142,31 +142,31 @@ tt.func @async_copy_global_to_local(%A : !tt.ptr, %i1 : i1) { %a_ptr = tt.splat %A : !tt.ptr -> tensor<16x16x!tt.ptr, #AL> %mask = tt.splat %i1 : i1 -> tensor<16x16xi1, #AL> %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %alloc = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %subview = triton_gpu.memdesc_subview %alloc[%index, %index, %index] : !tt.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.async_copy_global_to_local %a_ptr, %subview : tensor<16x16x!tt.ptr, #AL> -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %subview = triton_gpu.memdesc_subview %alloc[%index, %index, %index] : !triton_gpu.memdesc<1x16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.async_copy_global_to_local %a_ptr, %subview : tensor<16x16x!tt.ptr, #AL> -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %4 = triton_gpu.local_load %subview : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %subview : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return } // If branch inserted a barrier for %cst0, but else didn't, then the barrier should be inserted in the parent region // CHECK-LABEL: multi_blocks tt.func @multi_blocks(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : 
!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } else { - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -174,21 +174,21 @@ tt.func @multi_blocks(%i1 : i1) { // CHECK-LABEL: multi_blocks_join_barrier tt.func @multi_blocks_join_barrier(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } // CHECK-NOT: gpu.barrier // CHECK: tt.return - %a_ = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %a_ = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -196,25 +196,25 @@ tt.func @multi_blocks_join_barrier(%i1 : i1) { // CHECK-LABEL: multi_blocks_yield tt.func @multi_blocks_yield(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %a = scf.if %i1 -> (!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = scf.if %i1 -> (!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> 
!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %3 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %3 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } - %a_ = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %a_ = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> // CHECK: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %4 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -222,27 +222,27 @@ tt.func @multi_blocks_yield(%i1 : i1) { // CHECK-LABEL: multi_blocks_entry_no_shared tt.func @multi_blocks_entry_no_shared(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %a = scf.if %i1 -> (!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = scf.if %i1 -> (!triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %0 = triton_gpu.local_load %cst1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %0 = triton_gpu.local_load %cst1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } else { // CHECK-NOT: gpu.barrier // CHECK: triton_gpu.local_alloc - %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst1 : 
!tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -250,16 +250,16 @@ tt.func @multi_blocks_entry_no_shared(%i1 : i1) { // CHECK-LABEL: multi_blocks_noelse tt.func @multi_blocks_noelse(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -267,39 +267,39 @@ tt.func @multi_blocks_noelse(%i1 : i1) { // CHECK-LABEL: multi_blocks_nested_scf tt.func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> scf.if %i1 { scf.if %i2 { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> scf.yield } scf.yield } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %1 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %2 = triton_gpu.local_load %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } // CHECK-LABEL: for tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> 
!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a0 = triton_gpu.local_load %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %b_shared, %a_shared, %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a0 = triton_gpu.local_load %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %b_shared, %a_shared, %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } tt.return } @@ -309,24 +309,24 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !t // CHECK-LABEL: for_alias tt.func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load 
- %a0 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a0 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a1 = triton_gpu.local_load %a_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b1 = triton_gpu.local_load %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %c_shared, %a_shared, %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a1 = triton_gpu.local_load %a_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b1 = triton_gpu.local_load %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %c_shared, %a_shared, %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %r = triton_gpu.local_load %0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %r = triton_gpu.local_load %0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -335,63 +335,63 @@ tt.func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr, % // CHECK-LABEL: for_reuse tt.func @for_reuse(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, 
#AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a0 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a0 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a1 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b1 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a1 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b1 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a2 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> 
tensor<128x32xf16, #AL> - %b2 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %c_shared, %a_shared, %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a2 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b2 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %2 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared, %a_shared, %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %r = triton_gpu.local_load %0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %r = triton_gpu.local_load %0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } // CHECK-LABEL: for_reuse_nested tt.func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %a0 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b0 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %0 = triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a0 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b0 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %0 
= triton_gpu.local_alloc %a0 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a1 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b1 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared_next, %b_shared_next, %c_shared_next = scf.for %ivv = %lb to %ub step %step iter_args(%a_shared_nested = %a_shared_init, %b_shared_nested = %b_shared_init, %c_shared_nested = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %a1 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b1 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %1 = triton_gpu.local_alloc %a1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_next, %b_shared_next, %c_shared_next = scf.for %ivv = %lb to %ub step %step iter_args(%a_shared_nested = %a_shared_init, %b_shared_nested = %b_shared_init, %c_shared_nested = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %a2 = triton_gpu.local_load %a_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %b2 = triton_gpu.local_load %b_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %a2 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %c_shared_nested, %a_shared_nested, %b_shared_nested : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a2 = triton_gpu.local_load %a_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %b2 = triton_gpu.local_load %b_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %2 = triton_gpu.local_alloc %a2 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared_nested, %a_shared_nested, 
%b_shared_nested : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %c_shared, %a_shared, %b_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared, %a_shared, %b_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %r = triton_gpu.local_load %0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %r = triton_gpu.local_load %0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -399,25 +399,25 @@ tt.func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr< // CHECK-LABEL: for_for_if tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { - %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { - %c_shared_next_next = scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { // CHECK: gpu.barrier // CHECK-NEXT: 
triton_gpu.local_alloc - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } else { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %a_shared, %b_shared, %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %a_shared, %b_shared, %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } tt.return } @@ -426,30 +426,30 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-LABEL: for_if_for tt.func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> - %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %a_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_shared_init = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier - %c_blocked = triton_gpu.local_load %c_shared_init : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %c_blocked = triton_gpu.local_load %c_shared_init : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { - %c_shared_next_next = 
scf.if %i1 -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { + %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_next_next = scf.if %i1 -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> - scf.yield %cst0 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %cst0 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } else { - %c_shared_ = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { + %c_shared_ = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>) { // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %c_blocked_next = triton_gpu.local_load %c_shared_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %c_shared : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %c_blocked_next = triton_gpu.local_load %c_shared_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %c_shared : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } - scf.yield %c_shared_ : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + scf.yield %c_shared_ : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } // CHECK-NOT: gpu.barrier - %b_blocked_next = triton_gpu.local_load %b_shared: !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> - scf.yield %a_shared, %b_shared, %c_shared_next_next : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %b_blocked_next = triton_gpu.local_load %b_shared: !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + scf.yield %a_shared, %b_shared, %c_shared_next_next : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory>, !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> } tt.return } @@ -457,65 +457,65 @@ tt.func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, // CHECK-LABEL: cf_if tt.func @cf_if(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: 
triton_gpu.local_load - %0 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> cf.br ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %1 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: cf_if_else tt.func @cf_if_else(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - cf.br ^bb3(%1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) + %0 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_alloc %0 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + cf.br ^bb3(%1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) ^bb2: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - cf.br ^bb3(%3 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) -^bb3(%arg: !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>): // 2 preds: ^bb1, ^bb2 + %2 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_alloc %2 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + cf.br ^bb3(%3 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>) +^bb3(%arg: !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory>): // 2 preds: ^bb1, ^bb2 cf.br ^bb4 ^bb4: // pred: ^bb3 // CHECK: triton_gpu.local_load - %4 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %5 = triton_gpu.local_load %arg : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %5 = triton_gpu.local_load %arg : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: cf_if_else_return tt.func @cf_if_else_return(%i1 : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %a = 
triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> - %b = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %a = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %b = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> cf.cond_br %i1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %0 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %1 = triton_gpu.local_load %b : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %1 = triton_gpu.local_load %b : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return ^bb2: // pred: ^bb0 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %2 = triton_gpu.local_load %a : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> - %3 = triton_gpu.local_load %b : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_load %a : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_load %b : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<16x16xf16, #AL> tt.return } @@ -524,9 +524,9 @@ tt.func @atomic_scalar(%arg3: !tt.ptr) -> i32 { // CHECK-NOT: gpu.barrier %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return %4 : i32 } @@ -534,11 +534,11 @@ tt.func @atomic_scalar(%arg3: !tt.ptr) -> i32 { tt.func @atomic_scalar_no_use(%arg3: !tt.ptr) { %c0_i32 = arith.constant 0 : i32 %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %3 = triton_gpu.local_load %2 : !tt.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> + %3 = triton_gpu.local_load %2 : !triton_gpu.memdesc<128x32xf16, #A_SHARED, #triton_gpu.shared_memory> -> tensor<128x32xf16, #AL> tt.return } @@ -549,38 +549,38 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : // CHECK-LABEL: convert_layout1 tt.func 
@convert_layout1(%A : !tt.ptr) { // CHECK-NOT: gpu.barrier - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: convert_layout2 tt.func @convert_layout2(%A : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK: triton_gpu.local_load - %3 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> - %4 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %3 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %4 = triton_gpu.local_load %1 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> tt.return } // CHECK-LABEL: convert_layout3 tt.func @convert_layout3(%cond : i1) { scf.if %cond { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_load // CHECK-NOT: gpu.barrier - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #AL> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x64xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #AL> } else { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> // CHECK: triton_gpu.local_load // CHECK-NEXT: gpu.barrier // CHECK-NEXT: triton_gpu.local_alloc - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #AL> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory, mutable> } tt.return } @@ -619,7 +619,7 @@ tt.func @single_call_no_sync(%A : !tt.ptr) { // CHECK-LABEL: multiple_calls tt.func @multiple_calls(%A : !tt.ptr) { %cst = arith.constant dense<0.000000e+00> : 
tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.call @convert_layout1(%A) : (!tt.ptr) -> () %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> tt.call @convert_layout2(%A) : (!tt.ptr) -> () @@ -631,12 +631,12 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { scf.if %cond { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %cst_ = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: tt.call // CHECK-NEXT: gpu.barrier tt.call @convert_layout1(%A) : (!tt.ptr) -> () - %cst1 = triton_gpu.local_alloc %cst_ : (tensor<16x32xf16, #AL>) -> !tt.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst1 = triton_gpu.local_alloc %cst_ : (tensor<16x32xf16, #AL>) -> !triton_gpu.memdesc<16x32xf16, #A_SHARED, #triton_gpu.shared_memory> } else { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK: tt.call @@ -649,7 +649,7 @@ tt.func @if_else_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: for_calls tt.func @for_calls(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> %lb = arith.constant 0 : index %ub = arith.constant 10 : index @@ -665,7 +665,7 @@ tt.func @for_calls(%A : !tt.ptr, %cond : i1) { // CHECK-LABEL: call_graph_1 tt.func @call_graph_1(%A : !tt.ptr, %cond : i1) { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> // CHECK: gpu.barrier // CHECK-NEXT: tt.call tt.call @convert_layout3(%cond) : (i1) -> () tt.return @@ -677,7 +677,7 @@ tt.func @call_graph_2(%A : !tt.ptr, %cond : i1) { tt.call @convert_layout4(%A, %cond) : (!tt.ptr, i1) -> () // CHECK: tt.call // CHECK-NEXT: gpu.barrier - %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !tt.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> + %cst0 = triton_gpu.local_alloc %cst : (tensor<16x16xf16, #AL>) -> !triton_gpu.memdesc<16x16xf16, #A_SHARED, #triton_gpu.shared_memory> tt.return } @@ -694,8 +694,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func public @kernel(%arg3: !tt.ptr, %arg4: !tt.ptr, %arg12: tensor<32x128xf16, #blocked>, %arg13: tensor<32x128xf32, #blocked>, %arg14: tensor<32x32xf16, #blocked1>) { %c0_i32 = arith.constant 0 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked> - %37 = triton_gpu.local_alloc %arg14 {allocation.offset = 0 : i32} : (tensor<32x32xf16, 
#blocked1>) -> !tt.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> - %58 = triton_gpu.local_alloc %arg12 : (tensor<32x128xf16, #blocked>) -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> + %37 = triton_gpu.local_alloc %arg14 {allocation.offset = 0 : i32} : (tensor<32x32xf16, #blocked1>) -> !triton_gpu.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> + %58 = triton_gpu.local_alloc %arg12 : (tensor<32x128xf16, #blocked>) -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> cf.br ^bb1 ^bb1: // 2 preds: ^bb0, ^bb1 %59 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr, i32, i32) -> i32 @@ -703,8 +703,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : cf.cond_br %60, ^bb1, ^bb2 ^bb2: // pred: ^bb1 %72 = triton_gpu.convert_layout %arg13 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #mma> - %73 = triton_gpu.local_load %37 : !tt.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %74 = triton_gpu.local_load %58 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %73 = triton_gpu.local_load %37 : !triton_gpu.memdesc<32x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %74 = triton_gpu.local_load %58 : !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %75 = tt.dot %73, %74, %72, inputPrecision = tf32 : tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x128xf32, #mma> %76 = triton_gpu.convert_layout %75 {allocation.offset = 0 : i32} : tensor<32x128xf32, #mma> -> tensor<32x128xf32, #blocked> %77 = arith.truncf %76 : tensor<32x128xf32, #blocked> to tensor<32x128xf16, #blocked> @@ -725,8 +725,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func @tma_special_cases(%arg1: !tt.ptr) -> (tensor<256x64xf16, #blocked>){ %true = arith.constant 1 : i1 %c0 = arith.constant 0 : i32 - %barrier = triton_gpu.local_alloc : () -> !tt.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> - %alloc = triton_gpu.local_alloc : () -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + %barrier = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK: triton_nvidia_gpu.init_barrier // CHECK-NEXT: triton_nvidia_gpu.init_barrier triton_nvidia_gpu.init_barrier %barrier, 1 : <1xi64, #shared1, #triton_gpu.shared_memory, mutable> @@ -749,7 +749,7 @@ tt.func @tma_special_cases(%arg1: !tt.ptr) -> (tensor<256x64xf16, #blocke triton_nvidia_gpu.wait_barrier %barrier, %c0 : <1xi64, #shared1, #triton_gpu.shared_memory, mutable> // CHECK-NEXT: triton_gpu.local_load - %t = triton_gpu.local_load %alloc : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> + %t = triton_gpu.local_load %alloc : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> // CHECK-NEXT: triton_nvidia_gpu.barrier_expect // CHECK-NEXT: gpu.barrier @@ -780,8 
+780,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func @tma_special_cases_cf(%arg1: !tt.ptr, %i1 : i1, %arg2: tensor<256x64xf16, #blocked>) -> (tensor<256x64xf16, #blocked>){ %true = arith.constant 1 : i1 %c0 = arith.constant 0 : i32 - %barrier = triton_gpu.local_alloc : () -> !tt.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> - %alloc = triton_gpu.local_alloc : () -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + %barrier = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1xi64, #shared1, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK: cf.cond_br scf.if %i1 { // CHECK-NOT: gpu.barrier @@ -797,12 +797,12 @@ tt.func @tma_special_cases_cf(%arg1: !tt.ptr, %i1 : i1, %arg2: tensor<256 // CHECK-NOT: gpu.barrier // CHECK: triton_gpu.local_store // CHECK-NEXT: cf.br - triton_gpu.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> scf.yield } // CHECK: gpu.barrier // CHECK-NEXT: triton_gpu.local_load - %t = triton_gpu.local_load %alloc : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> + %t = triton_gpu.local_load %alloc : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #blocked> tt.return %t : tensor<256x64xf16, #blocked> } } diff --git a/test/Conversion/amd/compute-base-ptr.mlir b/test/Conversion/amd/compute-base-ptr.mlir index 809e5a8699..c62f7bfb6c 100644 --- a/test/Conversion/amd/compute-base-ptr.mlir +++ b/test/Conversion/amd/compute-base-ptr.mlir @@ -7,10 +7,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @local_load_offset tt.func @local_load_offset(%arg0: tensor<16x16xf16, #mma>) { %0 = triton_gpu.convert_layout %arg0 {allocation.offset = 0 : i32} : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #blocked> loc(#loc1) - %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> loc(#loc2) + %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> loc(#loc2) // This catches base ptr calculation in the computeBasePtr, checks if the gep has correct element type. 
// CHECK: llvm.getelementptr {{.*}} (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16 local_load:3:0 - %2 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> loc(#loc3) + %2 = triton_gpu.local_load %1 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> loc(#loc3) tt.return } } diff --git a/test/Conversion/amd/decompose-unsupported-conversions.mlir b/test/Conversion/amd/decompose-unsupported-conversions.mlir index 1bd288449f..9e6acf2e4b 100644 --- a/test/Conversion/amd/decompose-unsupported-conversions.mlir +++ b/test/Conversion/amd/decompose-unsupported-conversions.mlir @@ -8,7 +8,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx1130", "triton_gpu.threads-per-warp" = 32 : i32} { tt.func @wmma_to_wmma_dot_op(%arg0: tensor<16x16xf16, #mma>) { // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<16x16xf16, #[[$WMMA]]> -> tensor<16x16xf16, #[[$BLOCKED]]> - // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> + // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !triton_gpu.memdesc<16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$WMMA]], kWidth = 16}>> %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> tt.return @@ -25,7 +25,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { tt.func @wmma_to_wmma_dot3d_op(%arg0: tensor<2x16x16xf16, #mma>) { // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<2x16x16xf16, #[[$WMMA]]> -> tensor<2x16x16xf16, #[[$BLOCKED]]> - // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<2x16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> + // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !triton_gpu.memdesc<2x16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory> // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<2x16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$WMMA]], kWidth = 16}>> %0 = triton_gpu.convert_layout %arg0 : tensor<2x16x16xf16, #mma> -> tensor<2x16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> tt.return diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index d9a37b5c75..98d97f5cce 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -44,19 +44,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: small_mfma_tensor_conversions tt.func public @small_mfma_tensor_conversions(%arg0: tensor<16x16xf16, #mfma>, %arg1: tensor<16x16x!tt.ptr, #mfma>) { // CHECK-NOT: triton_gpu.convert_layout - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #mfma>) -> !tt.memdesc<16x16xf16, #shared, 
#triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #mfma>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> // CHECK-4: store {{.*}} vector<4xf16> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop0> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop0> // CHECK-2: load {{.*}} vector<4xf16> - %2 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop1> + %2 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop1> // CHECK-8: load {{.*}} vector<1xf16> - %3 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #mfma> + %3 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #mfma> // CHECK-4: load {{.*}} vector<4xf16> %4 = tt.fp_to_fp %3 : tensor<16x16xf16, #mfma> -> tensor<16x16xf32, #mfma> %5 = tt.dot %1, %2, %4 : tensor<16x16xf16, #dotop0> * tensor<16x16xf16, #dotop1> -> tensor<16x16xf32, #mfma> // Store result to prevent DCE from removing all conversion related code - %6 = triton_gpu.local_alloc %5 : (tensor<16x16xf32, #mfma>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + %6 = triton_gpu.local_alloc %5 : (tensor<16x16xf32, #mfma>) -> !triton_gpu.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> tt.return } } diff --git a/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir b/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir index 5eb856bb99..e7dcb873d0 100644 --- a/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir @@ -5,22 +5,22 @@ #mma2 = #triton_gpu.amd_wmma<{version = 2, warpsPerCTA = [2, 2]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: wmma1_dot_operand - tt.func @wmma1_dot_operand(%arg0: !tt.memdesc<64x64xf16, #shared>) { + tt.func @wmma1_dot_operand(%arg0: !triton_gpu.memdesc<64x64xf16, #shared>) { // 2 CTA * 4 rep * load_per_thread_per_instr // CHECK-COUNT-8: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xf16> - %0 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> + %0 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> // CHECK-COUNT-128: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<1xf16> - %1 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> tt.return } // CHECK-LABEL: wmma2_dot_operand - tt.func @wmma2_dot_operand(%arg0: !tt.memdesc<64x64xf16, #shared>) { + tt.func @wmma2_dot_operand(%arg0: !triton_gpu.memdesc<64x64xf16, #shared>) { // 2 CTA * 4 rep * load_per_thread_per_instr // CHECK-COUNT-8: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<8xf16> - %0 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma2, kWidth = 8}>> + %0 = 
triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma2, kWidth = 8}>> // CHECK-COUNT-64: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<1xf16> - %1 = triton_gpu.local_load %arg0 : !tt.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma2, kWidth = 8}>> + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma2, kWidth = 8}>> tt.return } @@ -105,11 +105,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #mma1 = #triton_gpu.amd_wmma<{version = 1, warpsPerCTA = [2, 1, 4]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: wmma_dot_operand3d - tt.func @wmma_dot_operand3d(%arg0: !tt.memdesc<4x16x32xf16, #shared>) { + tt.func @wmma_dot_operand3d(%arg0: !triton_gpu.memdesc<4x16x32xf16, #shared>) { // CHECK-COUNT-4: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xf16> - %0 = triton_gpu.local_load %arg0 : !tt.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> + %0 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> // CHECK-COUNT-32: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<1xf16> - %1 = triton_gpu.local_load %arg0 : !tt.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<4x16x32xf16, #shared> -> tensor<4x16x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> tt.return } diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index b2b64014b0..3f2fd578da 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -447,7 +447,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.mlir.addressof @global_smem // CHECK-NEXT: llvm.getelementptr // CHECK-NEXT: llvm.mlir.constant - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -477,8 +477,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: llvm.getelementptr %index = arith.constant 1 : i32 %zero = arith.constant 0 : i32 - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.memdesc_subview %0[%index, %zero, %zero] : !tt.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.memdesc_subview %0[%index, %zero, %zero] : !triton_gpu.memdesc<128x16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x32xf32, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -509,10 +509,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %24 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice1d0> %59 = 
tt.addptr %58, %24 : tensor<64x!tt.ptr, #slice1d0>, tensor<64xi32, #slice1d0> %66 = tt.addptr %59, %cst_2 : tensor<64x!tt.ptr, #slice1d0>, tensor<64xi32, #slice1d0> - %71 = triton_gpu.local_alloc : () -> !tt.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> + %71 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> %subview = triton_gpu.memdesc_subview %71[%c0_i32, %c0_i32] : - !tt.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> -> - !tt.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> + !triton_gpu.memdesc<2x64xi64, #shared2D, #triton_gpu.shared_memory, mutable> -> + !triton_gpu.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> // CHECK: llvm.inline_asm has_side_effects asm_dialect = att // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 @@ -523,7 +523,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8 // CHECK: cp.async.commit_group - %73 = triton_gpu.async_copy_global_to_local %66, %subview : tensor<64x!tt.ptr, #slice1d0> -> !tt.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> + %73 = triton_gpu.async_copy_global_to_local %66, %subview : tensor<64x!tt.ptr, #slice1d0> -> !triton_gpu.memdesc<64xi64, #shared1D, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group %73 tt.return } @@ -556,14 +556,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<16x64xi32, #AL> %a_init = tt.splat %arg0 : !tt.ptr -> tensor<16x64x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<16x64x!tt.ptr, #AL>, tensor<16x64xi32, #AL> - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> %index = arith.constant 1 : i32 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@${{.*}} cp.async.cg.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x10, 0x10;" // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@${{.*}} cp.async.cg.shared.global [ ${{.*}} + 16 ], [ ${{.*}} + 0 ], 0x10, 0x10;" // CHECK: llvm.inline_asm has_side_effects asm_dialect = att // CHECK-SAME: cp.async.commit_group - %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x64x!tt.ptr, #AL> -> !tt.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x64x!tt.ptr, #AL> -> !triton_gpu.memdesc<16x64xf32, #A, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group tt.return } @@ -596,7 +596,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<16x32xi32, #AL> %a_init = tt.splat %arg0 : !tt.ptr -> tensor<16x32x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<16x32x!tt.ptr, #AL>, tensor<16x32xi32, #AL> - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, 
mutable> %index = arith.constant 1 : i32 // CHECK: llvm.inline_asm @@ -609,7 +609,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.commit_group - %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x32x!tt.ptr, #AL> -> !tt.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<16x32x!tt.ptr, #AL> -> !triton_gpu.memdesc<16x32xf32, #A, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group tt.return } @@ -641,7 +641,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<32x32xi32, #AL> %a_init = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - %tensor = triton_gpu.local_alloc : () -> !tt.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> + %tensor = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> %index = arith.constant 1 : i32 // CHECK: llvm.mlir.constant(0 : i32) : i32 @@ -665,7 +665,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.commit_group - %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<32x32x!tt.ptr, #AL> -> !tt.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> + %a = triton_gpu.async_copy_global_to_local %a_ptr, %tensor : tensor<32x32x!tt.ptr, #AL> -> !triton_gpu.memdesc<32x32xf32, #A, #triton_gpu.shared_memory, mutable> triton_gpu.async_commit_group tt.return } @@ -773,14 +773,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_dot tt.func @convert_dot(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) { - %AA = triton_gpu.local_alloc %A : (tensor<16x16xf16, #blocked0>) -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> - %BB = triton_gpu.local_alloc %B : (tensor<16x16xf16, #blocked0>) -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> + %AA = triton_gpu.local_alloc %A : (tensor<16x16xf16, #blocked0>) -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> + %BB = triton_gpu.local_alloc %B : (tensor<16x16xf16, #blocked0>) -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> // CHECK: llvm.inline_asm // CHECK: ldmatrix.sync.aligned.m8n8.x4 // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 - %AA_DOT = triton_gpu.local_load %AA : !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> - %BB_DOT = triton_gpu.local_load %BB : !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_b> + %AA_DOT = triton_gpu.local_load %AA : !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> + %BB_DOT = triton_gpu.local_load %BB : !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_b> %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0> // CHECK: llvm.inline_asm 
@@ -812,12 +812,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_dot_fp8 tt.func @convert_dot_fp8(%A: tensor<16x16xf8E5M2, #blocked0>, %B: tensor<16x16xf8E5M2, #blocked0>) { - %AA = triton_gpu.local_alloc %A : (tensor<16x16xf8E5M2, #blocked0>) -> !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> - %BB = triton_gpu.local_alloc %B : (tensor<16x16xf8E5M2, #blocked0>) -> !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> + %AA = triton_gpu.local_alloc %A : (tensor<16x16xf8E5M2, #blocked0>) -> !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> + %BB = triton_gpu.local_alloc %B : (tensor<16x16xf8E5M2, #blocked0>) -> !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 - %AA_DOT = triton_gpu.local_load %AA : !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_a> - %BB_DOT = triton_gpu.local_load %BB : !tt.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_b> + %AA_DOT = triton_gpu.local_load %AA : !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_a> + %BB_DOT = triton_gpu.local_load %BB : !triton_gpu.memdesc<16x16xf8E5M2, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf8E5M2, #dot_operand_b> %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0> // CHECK: llvm.inline_asm @@ -1054,7 +1054,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-SAME: !llvm.ptr<3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr<3> - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> tt.return } } @@ -1111,11 +1111,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma, kWidth=2}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @matmul_kernel_dot_operand_layout(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, - %a:!tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory>, %b:!tt.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory>) { + %a:!triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory>, %b:!triton_gpu.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma> // CHECK: ldmatrix.sync.aligned.m8n8.x4.shared.b16 - %a_mat = triton_gpu.local_load %a : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x32xf16, #dot_operand_a> - %b_mat = triton_gpu.local_load %b : !tt.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x256xf16, #dot_operand_b> + %a_mat = triton_gpu.local_load %a : !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x32xf16, #dot_operand_a> + %b_mat = triton_gpu.local_load %b : !triton_gpu.memdesc<32x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<32x256xf16, #dot_operand_b> %28 = tt.dot %a_mat, %b_mat, %cst : tensor<128x32xf16, #dot_operand_a> * tensor<32x256xf16, #dot_operand_b> -> tensor<128x256xf32, #mma> 
%38 = triton_gpu.convert_layout %28 : tensor<128x256xf32, #mma> -> tensor<128x256xf32, #blocked> @@ -1135,11 +1135,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#blocked}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @matmul_fmadot(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, - %a:!tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { + %a:!triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> // CHECK: llvm.intr.fmuladd - %a_mat = triton_gpu.local_load %a : !tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> - %b_mat = triton_gpu.local_load %b : !tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> + %a_mat = triton_gpu.local_load %a : !triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> + %b_mat = triton_gpu.local_load %b : !triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = ieee : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #blocked> %30 = tt.splat %ptr : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> @@ -1159,7 +1159,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: matmul_tf32dot tt.func @matmul_tf32dot(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, - %a:!tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { + %a:!triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory>, %b:!triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory>) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16 @@ -1167,8 +1167,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16 // CHECK-SAME: (i32, i32, i32, i32) - %a_mat = triton_gpu.local_load %a : !tt.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> - %b_mat = triton_gpu.local_load %b : !tt.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> + %a_mat = triton_gpu.local_load %a : !triton_gpu.memdesc<32x16xf32, #shared, #triton_gpu.shared_memory> -> tensor<32x16xf32, #dot_operand_a> + %b_mat = triton_gpu.local_load %b : !triton_gpu.memdesc<16x32xf32, #shared, #triton_gpu.shared_memory> -> tensor<16x32xf32, #dot_operand_b> // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 @@ -1391,8 +1391,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-LABEL: test_base_index_cache tt.func @test_base_index_cache(%arg0: tensor<128x32xf32, #blocked0>) { // CHECK: nvvm.read.ptx.sreg.tid.x - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> - %1 = 
triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> tt.return } } @@ -1404,10 +1404,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-LABEL: test_index_cache_different_block tt.func @test_index_cache_different_block(%arg0: tensor<128x32xf32, #blocked0>, %arg1: i1) { // CHECK: nvvm.read.ptx.sreg.tid.x - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> cf.cond_br %arg1, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - %1 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !tt.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !triton_gpu.memdesc<128x32xf32, #shared0, #triton_gpu.shared_memory> cf.br ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 tt.return @@ -1648,16 +1648,16 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : tt.func @i16_mma_layout(%f16_inp: tensor<16x16xf16, #blocked0>, %i16_inp: tensor<16x16xi16, #blocked0>) { // CHECK-LABEL: @i16_mma_layout - %f16_shared = triton_gpu.local_alloc %f16_inp : (tensor<16x16xf16, #blocked0>) -> !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> - %i16_shared = triton_gpu.local_alloc %i16_inp : (tensor<16x16xi16, #blocked0>) -> !tt.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> + %f16_shared = triton_gpu.local_alloc %f16_inp : (tensor<16x16xf16, #blocked0>) -> !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> + %i16_shared = triton_gpu.local_alloc %i16_inp : (tensor<16x16xi16, #blocked0>) -> !triton_gpu.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 - %f16_dot = triton_gpu.local_load %f16_shared : !tt.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> - %i16_dot = triton_gpu.local_load %i16_shared : !tt.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xi16, #dot_operand_b> + %f16_dot = triton_gpu.local_load %f16_shared : !triton_gpu.memdesc<16x16xf16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dot_operand_a> + %i16_dot = triton_gpu.local_load %i16_shared : !triton_gpu.memdesc<16x16xi16, #shared0, #triton_gpu.shared_memory> -> tensor<16x16xi16, #dot_operand_b> // CHECK: llvm.sitofp %{{.*}} : i16 to f16 @@ -1720,8 +1720,8 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: llvm.load // CHECK-SAME: {alignment = 8 : i64} : !llvm.ptr<3> -> vector<8xi8> // CHECK-NOT: llvm.load - tt.func public @vectorize_shmem_load(%shmem : !tt.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory>) { - %0 = triton_gpu.local_load %shmem : !tt.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory> -> tensor<16x16xi8, #blocked> + tt.func public @vectorize_shmem_load(%shmem : !triton_gpu.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory>) { + %0 = 
triton_gpu.local_load %shmem : !triton_gpu.memdesc<16x16xi8, #shared, #triton_gpu.shared_memory> -> tensor<16x16xi8, #blocked> tt.return } } @@ -1736,7 +1736,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK-SAME: {alignment = 64 : i64} : vector<16xi32>, !llvm.ptr<3> // CHECK-NOT: llvm.store tt.func public @vectorize_shmem_store(%block : tensor<64x64xi32, #blocked>) { - %0 = triton_gpu.local_alloc %block : (tensor<64x64xi32, #blocked>) -> !tt.memdesc<64x64xi32, #shared, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %block : (tensor<64x64xi32, #blocked>) -> !triton_gpu.memdesc<64x64xi32, #shared, #triton_gpu.shared_memory> tt.return } } @@ -1761,9 +1761,9 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK: llvm.extractelement {{.*}} : vector<8xbf16> tt.func public @test_local_load_bf16() { %c0_i32 = arith.constant 0 : i32 - %19 = triton_gpu.local_alloc : () -> !tt.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> - %22 = triton_gpu.memdesc_subview %19[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> - %39 = triton_gpu.local_load %22 : !tt.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<1x2048xbf16, #blocked> + %19 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> + %22 = triton_gpu.memdesc_subview %19[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> + %39 = triton_gpu.local_load %22 : !triton_gpu.memdesc<1x2048xbf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<1x2048xbf16, #blocked> %40 = arith.extf %39 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> tt.return } @@ -1777,8 +1777,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.store tt.func public @test_local_store(%arg0: tensor<1xf32, #blocked>) { %c0_i32 = arith.constant 0 : i32 - %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg0, %0 : tensor<1xf32, #blocked> -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg0, %0 : tensor<1xf32, #blocked> -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> tt.return } } @@ -1791,9 +1791,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: llvm.store tt.func public @test_local_store_subview(%arg0: tensor<1xf32, #blocked>) { %c0_i32 = arith.constant 0 : i32 - %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> - %sv = triton_gpu.memdesc_subview %0[%c0_i32] : !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg0, %sv : tensor<1xf32, #blocked> -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc {allocation.offset = 0 : i32} : () -> !triton_gpu.memdesc<1xf32, #shared, 
#triton_gpu.shared_memory, mutable> + %sv = triton_gpu.memdesc_subview %0[%c0_i32] : !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg0, %sv : tensor<1xf32, #blocked> -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> tt.return } } diff --git a/test/Conversion/tritongpu_to_llvm_hopper.mlir b/test/Conversion/tritongpu_to_llvm_hopper.mlir index a8603f4a85..1f35d8fdd6 100644 --- a/test/Conversion/tritongpu_to_llvm_hopper.mlir +++ b/test/Conversion/tritongpu_to_llvm_hopper.mlir @@ -5,7 +5,7 @@ #shared1 = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 2, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { // CHECK-LABEL: @dot_high_precision_acc - tt.func @dot_high_precision_acc(%a: !tt.memdesc<128x128xf8E5M2, #shared>, %b: !tt.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { + tt.func @dot_high_precision_acc(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { // CHECK: nvgpu.wgmma // CHECK-COUNT-128: llvm.fadd // CHECK: nvgpu.wgmma @@ -16,7 +16,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-COUNT-128: llvm.fadd %m = triton_nvidia_gpu.warp_group_dot %a, %b, %c {maxNumImpreciseAcc = 32 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> tt.return } } @@ -28,7 +28,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : #shared1 = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 2, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { // CHECK-LABEL: @dot_low_precision_acc - tt.func @dot_low_precision_acc(%a: !tt.memdesc<128x128xf8E5M2, #shared>, %b: !tt.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { + tt.func @dot_low_precision_acc(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { // CHECK: nvgpu.wgmma // CHECK-NOT: llvm.fadd // CHECK: nvgpu.wgmma @@ -40,7 +40,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: llvm.return %m = triton_nvidia_gpu.warp_group_dot %a, %b, %c {maxNumImpreciseAcc = 129 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> tt.return } } @@ -52,7 +52,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : #shared1 = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 2, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { // CHECK-LABEL: @dot_mix_precision_acc - tt.func @dot_mix_precision_acc(%a: !tt.memdesc<128x128xf8E5M2, 
#shared>, %b: !tt.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { + tt.func @dot_mix_precision_acc(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared1>, %c: tensor<128x256xf32, #mma>) { // CHECK: nvgpu.wgmma // CHECK-NOT: llvm.fadd // CHECK: nvgpu.wgmma @@ -64,7 +64,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: llvm.return %m = triton_nvidia_gpu.warp_group_dot %a, %b, %c {maxNumImpreciseAcc = 64 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x256xf8E5M2, #shared1> -> tensor<128x256xf32, #mma> tt.return } } @@ -78,10 +78,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @dot_zero_acc // Generate a wgmma with 2 sources. // CHECK: nvgpu.wgmma %{{.*}}, %{{.*}} { - tt.func @dot_zero_acc(%a: !tt.memdesc<128x64xf16, #shared>, %b: !tt.memdesc<64x64xf16, #shared1>) { + tt.func @dot_zero_acc(%a: !triton_gpu.memdesc<128x64xf16, #shared>, %b: !triton_gpu.memdesc<64x64xf16, #shared1>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> %m = triton_nvidia_gpu.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, maxNumImpreciseAcc = 0 : i32} : - !tt.memdesc<128x64xf16, #shared> * !tt.memdesc<64x64xf16, #shared1> -> tensor<128x64xf32, #mma> + !triton_gpu.memdesc<128x64xf16, #shared> * !triton_gpu.memdesc<64x64xf16, #shared1> -> tensor<128x64xf32, #mma> tt.return } } @@ -95,11 +95,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // Generate a wgmma where the first operand is a struct. // CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> - tt.func @dot_reg_operand_A(%a: tensor<128x64xf16, #mma>, %b: !tt.memdesc<64x64xf16, #shared>) { + tt.func @dot_reg_operand_A(%a: tensor<128x64xf16, #mma>, %b: !triton_gpu.memdesc<64x64xf16, #shared>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> %opA = triton_gpu.convert_layout %a : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %m = triton_nvidia_gpu.warp_group_dot %opA, %b, %cst { inputPrecision = 0 : i32 }: - tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return } } @@ -114,10 +114,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // Generate a wgmma where the first operand is a struct. 
// CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32} - tt.func @dot_reg_operand_A_fp8(%a: tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %b: !tt.memdesc<128x256xf8E5M2, #shared>) { + tt.func @dot_reg_operand_A_fp8(%a: tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %b: !triton_gpu.memdesc<128x256xf8E5M2, #shared>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma1> %m = triton_nvidia_gpu.warp_group_dot %a, %b, %cst { maxNumImpreciseAcc = 1073741824 : i32, inputPrecision = 0 : i32 } : - tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !tt.memdesc<128x256xf8E5M2, #shared> -> tensor<128x256xf32, #mma1> + tensor<128x128xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !triton_gpu.memdesc<128x256xf8E5M2, #shared> -> tensor<128x256xf32, #mma1> tt.return } } @@ -128,10 +128,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 64, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - tt.func @dot_reg_operand_upcast(%a_desc: !tt.memdesc<128x64xi8, #shared>, %b: !tt.memdesc<64x64xf16, #shared>, %acc: tensor<128x64xf32, #mma>) { - %a_dotop = triton_gpu.local_load %a_desc : !tt.memdesc<128x64xi8, #shared> -> tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + tt.func @dot_reg_operand_upcast(%a_desc: !triton_gpu.memdesc<128x64xi8, #shared>, %b: !triton_gpu.memdesc<64x64xf16, #shared>, %acc: tensor<128x64xf32, #mma>) { + %a_dotop = triton_gpu.local_load %a_desc : !triton_gpu.memdesc<128x64xi8, #shared> -> tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %a_casted = arith.sitofp %a_dotop : tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> to tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %res = triton_nvidia_gpu.warp_group_dot %a_casted, %b, %acc : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + %res = triton_nvidia_gpu.warp_group_dot %a_casted, %b, %acc : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return } } @@ -220,10 +220,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, 
"triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: dot_zero_acc_operand // CHECK-COUNT-128: llvm.fadd - tt.func @dot_zero_acc_operand(%a: !tt.memdesc<128x128xf8E5M2, #shared>, %b: !tt.memdesc<128x128xf8E5M2, #shared1>) { + tt.func @dot_zero_acc_operand(%a: !triton_gpu.memdesc<128x128xf8E5M2, #shared>, %b: !triton_gpu.memdesc<128x128xf8E5M2, #shared1>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> %m = triton_nvidia_gpu.warp_group_dot %a, %b, %cst {maxNumImpreciseAcc = 64 : i32, inputPrecision = 0 : i32} : - !tt.memdesc<128x128xf8E5M2, #shared> * !tt.memdesc<128x128xf8E5M2, #shared1> -> tensor<128x128xf32, #mma> + !triton_gpu.memdesc<128x128xf8E5M2, #shared> * !triton_gpu.memdesc<128x128xf8E5M2, #shared1> -> tensor<128x128xf32, #mma> tt.return } } @@ -238,7 +238,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : tt.func @distribute_to_shared_st_matrix(%a: tensor<128x128xf16, #mma>) { // CHECK-COUNT-16: nvgpu.stmatrix // CHECK: llvm.return - %b = triton_gpu.local_alloc %a {allocation.offset = 0 : i32} : (tensor<128x128xf16, #mma>) -> !tt.memdesc<128x128xf16, #shared, mutable> + %b = triton_gpu.local_alloc %a {allocation.offset = 0 : i32} : (tensor<128x128xf16, #mma>) -> !triton_gpu.memdesc<128x128xf16, #shared, mutable> tt.return } } diff --git a/test/Conversion/tritonnvidiagpu_to_llvm.mlir b/test/Conversion/tritonnvidiagpu_to_llvm.mlir index 0bcab369f7..c7cc5fa5db 100644 --- a/test/Conversion/tritonnvidiagpu_to_llvm.mlir +++ b/test/Conversion/tritonnvidiagpu_to_llvm.mlir @@ -3,9 +3,9 @@ #shared0 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: init_barrier - tt.func @init_barrier(%alloc: !tt.memdesc<1xi64, #shared0>) { + tt.func @init_barrier(%alloc: !triton_gpu.memdesc<1xi64, #shared0>) { // CHECK: "@$0 mbarrier.init.shared::cta.b64 [$1], 1;", "b,r" %{{.*}}, %{{.*}} : (i1, !llvm.ptr<3>) -> !llvm.void - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0> tt.return } } @@ -15,11 +15,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #shared0 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: wait_barrier - tt.func @wait_barrier(%alloc: !tt.memdesc<1xi64, #shared0>, %phase: i32) { + tt.func @wait_barrier(%alloc: !triton_gpu.memdesc<1xi64, #shared0>, %phase: i32) { // CHECK: waitLoop: // CHECK: mbarrier.try_wait.parity.shared.b64 // CHECK: @!P1 bra.uni waitLoop - triton_nvidia_gpu.wait_barrier %alloc, %phase : !tt.memdesc<1xi64, #shared0> + triton_nvidia_gpu.wait_barrier %alloc, %phase : !triton_gpu.memdesc<1xi64, #shared0> tt.return } } @@ -35,8 +35,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: "@$0 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$1], [$2, {$3, $4}], [$5];", "b,r,l,r,r,r" {{.*}} : (i1, !llvm.ptr<3>, !llvm.ptr<1>, i32, i32, !llvm.ptr<3>) -> !llvm.void // CHECK-NOT: cp.async.bulk.tensor.2d.shared // CHECK: return - tt.func @tma_copy_global_to_local(%tma: !tt.ptr, %alloc: 
!tt.memdesc<128x128xf32, #shared1, mutable>, %x: i32, %barrier: !tt.memdesc<1xi64, #shared0>, %pred: i1) { - triton_nvidia_gpu.async_tma_copy_global_to_local %tma[%x, %x] %alloc, %barrier, %pred : !tt.ptr, !tt.memdesc<1xi64, #shared0> -> !tt.memdesc<128x128xf32, #shared1, mutable> + tt.func @tma_copy_global_to_local(%tma: !tt.ptr, %alloc: !triton_gpu.memdesc<128x128xf32, #shared1, mutable>, %x: i32, %barrier: !triton_gpu.memdesc<1xi64, #shared0>, %pred: i1) { + triton_nvidia_gpu.async_tma_copy_global_to_local %tma[%x, %x] %alloc, %barrier, %pred : !tt.ptr, !triton_gpu.memdesc<1xi64, #shared0> -> !triton_gpu.memdesc<128x128xf32, #shared1, mutable> tt.return } } @@ -50,7 +50,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: "@$0 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$1, {$2, $3}], [$4];", "b,l,r,r,r" {{.*}} : (i1, !llvm.ptr<1>, i32, i32, !llvm.ptr<3>) -> !llvm.void // CHECK-NOT: cp.async.bulk.tensor.2d.global.shared::cta.bulk_group // CHECK: cp.async.bulk.commit_group - tt.func @tma_copy_local_to_global(%tma: !tt.ptr, %alloc: !tt.memdesc<128x128xf32, #shared1>, %x: i32) { + tt.func @tma_copy_local_to_global(%tma: !tt.ptr, %alloc: !triton_gpu.memdesc<128x128xf32, #shared1>, %x: i32) { triton_nvidia_gpu.async_tma_copy_local_to_global %tma[%x, %x] %alloc : , <128x128xf32, #shared1> tt.return } @@ -74,7 +74,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: expect_barrier // CHECK: @$0 mbarrier.arrive.expect_tx.shared.b64 _, [$1], 16384; - tt.func @expect_barrier(%barrier: !tt.memdesc<1xi64, #shared0, mutable>, %pred: i1) { + tt.func @expect_barrier(%barrier: !triton_gpu.memdesc<1xi64, #shared0, mutable>, %pred: i1) { triton_nvidia_gpu.barrier_expect %barrier, 16384, %pred : <1xi64, #shared0, mutable> tt.return } diff --git a/test/Triton/invalid.mlir b/test/Triton/invalid.mlir index d121d285d3..c7fb41707e 100644 --- a/test/Triton/invalid.mlir +++ b/test/Triton/invalid.mlir @@ -277,9 +277,9 @@ tt.func public @fn(%arg0: tensor<2x4x8x16xf32, #blocked>, %arg1: tensor<16x32x64 #shared2 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], CTAsPerCGA = [1, 2], CTASplitNum = [2, 4], CTAOrder = [0, 1], hasLeadingOffset = true}> #shared3 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], CTAsPerCGA = [2, 1], CTASplitNum = [4, 2], CTAOrder = [1, 0], hasLeadingOffset = true}> module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 8 : i32, "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { -tt.func public @fn(%arg0: !tt.memdesc<2x4x8x16xf32, #shared>, %arg1: !tt.memdesc<16x32xf32, #shared2>) { - %a = triton_gpu.memdesc_trans %arg0 {order = array} : !tt.memdesc<2x4x8x16xf32, #shared> -> !tt.memdesc<4x16x8x2xf32, #shared1> - %b = triton_gpu.memdesc_trans %arg1 {order = array} : !tt.memdesc<16x32xf32, #shared2> -> !tt.memdesc<32x16xf32, #shared3> +tt.func public @fn(%arg0: !triton_gpu.memdesc<2x4x8x16xf32, #shared>, %arg1: !triton_gpu.memdesc<16x32xf32, #shared2>) { + %a = triton_gpu.memdesc_trans %arg0 {order = array} : !triton_gpu.memdesc<2x4x8x16xf32, #shared> -> !triton_gpu.memdesc<4x16x8x2xf32, #shared1> + %b = triton_gpu.memdesc_trans %arg1 {order = array} : !triton_gpu.memdesc<16x32xf32, #shared2> -> !triton_gpu.memdesc<32x16xf32, #shared3> tt.return } } // end module diff --git 
a/test/TritonGPU/accumulator-init.mlir b/test/TritonGPU/accumulator-init.mlir index 3026c5b162..e73934818a 100644 --- a/test/TritonGPU/accumulator-init.mlir +++ b/test/TritonGPU/accumulator-init.mlir @@ -11,14 +11,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @constant_init // CHECK-DAG: %[[FALSE:.+]] = arith.constant false // CHECK: triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, %[[FALSE]] - tt.func @constant_init(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @constant_init(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -27,14 +27,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: @constant_init_integer // CHECK-DAG: %[[FALSE:.+]] = arith.constant false // CHECK: triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, %[[FALSE]] - tt.func @constant_init_integer(%A: !tt.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xi32, #mma1> { + tt.func @constant_init_integer(%A: !triton_gpu.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xi32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0> : tensor<128x16xi32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xi32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !tt.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xi32, 
#mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %cst_2 : !triton_gpu.memdesc<128x64xi8, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xi8, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xi32, #mma1> scf.yield %acc: tensor<128x16xi32, #mma1> } tt.return %17 : tensor<128x16xi32, #mma1> @@ -53,14 +53,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @if_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -84,14 +84,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @if_after_mma_invert(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_after_mma_invert(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> 
tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %acc : tensor<128x16xf32, #mma1> } else { @@ -115,7 +115,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.yield %[[ACC]] // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC_CND]], %[[USE_ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @if_before_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_before_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ -127,7 +127,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } else { scf.yield %arg4 : tensor<128x16xf32, #mma1> } - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -146,7 +146,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.yield %[[ACC]] // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC_CND]], %[[USE_ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @if_before_mma_invert(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_before_mma_invert(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ -158,7 +158,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } else { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, 
#shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -173,14 +173,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC]], %[[USE_ACC]] // CHECK: %[[USE_ACC_NEXT:.*]] = arith.select %[[CND]], %[[FALSE]], %[[TRUE]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @sel_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @sel_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1> scf.yield %acc_: tensor<128x16xf32, #mma1> } @@ -196,7 +196,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[USE_ACC_NEXT:.*]] = arith.select %[[CND]], %[[FALSE]], %[[USE_ACC]] // CHECK: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, %[[ACC]], %[[USE_ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @sel_before_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @sel_before_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ 
-204,7 +204,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 %acc_ = arith.select %cnd, %cst_2, %arg4 : tensor<128x16xf32, #mma1> - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> @@ -230,7 +230,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_NEXT]] // CHECK: scf.yield {{.*}}, %[[TRUE]] - tt.func @if_before_and_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @if_before_and_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 @@ -242,7 +242,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } else { scf.yield %arg4 : tensor<128x16xf32, #mma1> } - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_0 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_0 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_1 = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -270,14 +270,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: else // CHECK: scf.yield %[[ACC_CND]] // CHECK: scf.yield {{.*}}, %[[USE_ACC_NEXT]] - tt.func @two_ifs_after_mma(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + tt.func @two_ifs_after_mma(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = 
arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_0 = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -296,15 +296,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // Check that we bail out in unsupported cases // CHECK-LABEL: @non_zero_init -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @non_zero_init(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @non_zero_init(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1> scf.yield %acc_: tensor<128x16xf32, #mma1> } @@ -312,15 +312,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // CHECK-LABEL: @zero_init_dist_2 -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @zero_init_dist_2(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, 
{{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @zero_init_dist_2(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17:2 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %cst_2) -> (tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg5 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg5 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1> scf.yield %acc_, %arg4: tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1> } @@ -328,8 +328,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // CHECK-LABEL: @if_defines_alternative -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @if_defines_alternative(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @if_defines_alternative(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %cst_3 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1> @@ -337,7 +337,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %cnd = arith.cmpi slt, %arg3, %ext : i32 - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { scf.yield %cst_2 : tensor<128x16xf32, #mma1> } else { @@ -350,15 +350,15 @@ module attributes 
{"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // CHECK-LABEL: @non_cond_override -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @non_cond_override(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @non_cond_override(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %cst_3 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = arith.addf %acc, %cst_3 : tensor<128x16xf32, #mma1> scf.yield %acc_: tensor<128x16xf32, #mma1> } @@ -367,15 +367,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // If the condition is a tensor skip the optimization. 
// CHECK-LABEL: @negative_sel_tensor -// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !tt.memdesc - tt.func @negative_sel_tensor(%A: !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %cnd: tensor<128x16xi1, #mma1>) -> tensor<128x16xf32, #mma1> { +// CHECK-NOT: %[[ACC_NEXT:.+]] = triton_nvidia_gpu.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !triton_gpu.memdesc + tt.func @negative_sel_tensor(%A: !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory>, %B: !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory>, %cnd: tensor<128x16xi1, #mma1>) -> tensor<128x16xf32, #mma1> { %c0_i32 = arith.constant 0 : i32 %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> %c1_i32 = arith.constant 1 : i32 %c8_i32 = arith.constant 8 : i32 %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 { %acc_ = arith.select %cnd, %cst_2, %arg4 : tensor<128x16xi1, #mma1>, tensor<128x16xf32, #mma1> - %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc = triton_nvidia_gpu.warp_group_dot %A, %B, %acc_ : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> scf.yield %acc: tensor<128x16xf32, #mma1> } tt.return %17 : tensor<128x16xf32, #mma1> diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 5dfd0f2a5f..51353d31c3 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -34,10 +34,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %54:1 = scf.for %arg21 = %c0_i32 to %arg20 step %c128_i32 iter_args(%arg26 = %c0_i64) -> (i64) : i32 { %73 = tt.splat %3 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked2> %74 = tt.load %73 : tensor<128x128x!tt.ptr, #blocked2> - %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> - %76 = triton_gpu.local_load %75 : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> - %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> - %78 = triton_gpu.local_load %77 : !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> + %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> + %76 = triton_gpu.local_load %75 : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], 
hasLeadingOffset = false}>, #triton_gpu.shared_memory> + %78 = triton_gpu.local_load %77 : !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> %79 = tt.dot %76, %78, %cst_2 : tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> * tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> -> tensor<256x128xf32, #mfma> %107 = arith.addi %arg26, %c128_i64 : i64 scf.yield %107 : i64 @@ -78,10 +78,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %cst_2 = arith.constant dense<0.000000e+00> : tensor<256x128xf32, #mfma> %73 = tt.splat %3 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked2> %74 = tt.load %73 : tensor<128x128x!tt.ptr, #blocked2> - %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> - %76 = triton_gpu.local_load %75 : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> - %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> - %78 = triton_gpu.local_load %77 : !tt.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> + %75 = triton_gpu.local_alloc %45 : (tensor<256x128xf16, #blocked1>) -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> + %76 = triton_gpu.local_load %75 : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + %77 = triton_gpu.local_alloc %74 : (tensor<128x128xf16, #blocked2>) -> !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> + %78 = triton_gpu.local_load %77 : !triton_gpu.memdesc<128x128xf16, #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}>, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> %79 = tt.dot %76, %78, %cst_2 : tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> * tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> -> tensor<256x128xf32, #mfma> %107 = arith.addi %arg26, %c128_i64 : i64 scf.yield %107 : i64 @@ -104,10 +104,10 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war tt.func public @order_load_alloc_local_load_local_store(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} { %9 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %10 = triton_gpu.local_alloc : () -> !tt.memdesc<32x32xf32, #shared, mutable> - triton_gpu.local_store %9, %10 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, mutable> + %10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<32x32xf32, #shared, mutable> + triton_gpu.local_store %9, %10 : tensor<32x32xf32, 
#blocked> -> !triton_gpu.memdesc<32x32xf32, #shared, mutable> %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %11 = triton_gpu.local_load %10 : !tt.memdesc<32x32xf32, #shared, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %11 = triton_gpu.local_load %10 : !triton_gpu.memdesc<32x32xf32, #shared, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %12 = tt.dot %11, %cst_0, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma> %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> tt.store %arg0, %13 : tensor<32x32x!tt.ptr, #blocked> @@ -222,22 +222,22 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %7 = tt.expand_dims %6 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> %8 = tt.broadcast %7 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> %9 = tt.addptr %5, %8 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %10 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %11 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %11 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> %12 = arith.cmpi slt, %arg0, %arg1 : index %13 = tt.splat %12 : i1 -> tensor<128x32xi1, #blocked1> %14 = tt.load %4, %13 : tensor<128x32x!tt.ptr, #blocked1> %15 = tt.splat %12 : i1 -> tensor<32x128xi1, #blocked> %16 = tt.load %9, %15, %cst_3 : tensor<32x128x!tt.ptr, #blocked> - %17 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %17 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %16, %18 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %19:6 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %4, %arg7 = %9, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %17, %arg11 = %18) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + %17 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %17 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] 
: !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %16, %18 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %19:6 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %4, %arg7 = %9, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %17, %arg11 = %18) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %20 = arith.subi %arg1, %arg2 : index %21 = arith.cmpi slt, %arg5, %20 : index - %22 = triton_gpu.local_load %arg10 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %23 = triton_gpu.local_load %arg11 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %22 = triton_gpu.local_load %arg10 : !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %23 = triton_gpu.local_load %arg11 : !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %24 = arith.mulf %23, %cst : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %25 = tt.dot %22, %24, %arg8 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> %26 = tt.addptr %arg6, %cst_1 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> @@ -249,14 +249,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %32 = arith.addi %arg9, %c1_i32 : i32 %33 = arith.cmpi slt, %32, %c1_i32 : i32 %34 = arith.select %33, %32, %c0_i32 : i32 - %35 = triton_gpu.memdesc_subview %10[%34, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %29, %35 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %36 = triton_gpu.memdesc_subview %11[%34, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %31, %36 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %26, %27, %25, %34, %35, %36 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %35 = triton_gpu.memdesc_subview %10[%34, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %29, %35 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, 
#triton_gpu.shared_memory, mutable> + %36 = triton_gpu.memdesc_subview %11[%34, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %31, %36 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %26, %27, %25, %34, %35, %36 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> } - triton_gpu.local_dealloc %10 : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %11 : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %10 : !triton_gpu.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %11 : !triton_gpu.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> tt.return %19#2 : tensor<128x128xf32, #mma> } @@ -313,8 +313,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %7 = tt.expand_dims %6 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> %8 = tt.broadcast %7 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> %9 = tt.addptr %5, %8 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %10 = triton_gpu.local_alloc : () -> !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %11 = triton_gpu.local_alloc : () -> !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %10 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %11 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> %12 = arith.cmpi slt, %arg0, %arg1 : index %13 = tt.splat %12 : i1 -> tensor<128x32xi1, #blocked1> %14 = tt.load %4, %13 : tensor<128x32x!tt.ptr, #blocked1> @@ -328,16 +328,16 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %22 = tt.load %19, %21 : tensor<128x32x!tt.ptr, #blocked1> %23 = tt.splat %18 : i1 -> tensor<32x128xi1, #blocked> %24 = tt.load %20, %23, %cst_3 : tensor<32x128x!tt.ptr, #blocked> - %25 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %25 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %26 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %16, %26 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %27:8 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %19, %arg7 = %20, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %25, %arg11 = %26, %arg12 = %22, %arg13 = %24) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, 
!tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked>) { + %25 = triton_gpu.memdesc_subview %10[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %25 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %26 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %16, %26 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %27:8 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %19, %arg7 = %20, %arg8 = %cst_2, %arg9 = %c0_i32, %arg10 = %25, %arg11 = %26, %arg12 = %22, %arg13 = %24) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked>) { %28 = arith.muli %arg2, %c2 : index %29 = arith.subi %arg1, %28 : index %30 = arith.cmpi slt, %arg5, %29 : index - %31 = triton_gpu.local_load %arg10 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %32 = triton_gpu.local_load %arg11 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %31 = triton_gpu.local_load %arg10 : !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %32 = triton_gpu.local_load %arg11 : !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %33 = arith.mulf %32, %cst : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %34 = tt.dot %31, %33, %arg8 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> %35 = tt.addptr %arg6, %cst_1 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> @@ -349,14 +349,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %41 = arith.addi %arg9, %c1_i32 : i32 %42 = arith.cmpi slt, %41, %c2_i32 : i32 %43 = arith.select %42, %41, %c0_i32 : i32 - %44 = triton_gpu.memdesc_subview %10[%43, %c0_i32, %c0_i32] : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg12, %44 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %45 = triton_gpu.memdesc_subview %11[%43, %c0_i32, %c0_i32] : !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg13, %45 : tensor<32x128xf16, #blocked> -> 
!tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %35, %36, %34, %43, %44, %45, %38, %40 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked> + %44 = triton_gpu.memdesc_subview %10[%43, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg12, %44 : tensor<128x32xf16, #blocked1> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %45 = triton_gpu.memdesc_subview %11[%43, %c0_i32, %c0_i32] : !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg13, %45 : tensor<32x128xf16, #blocked> -> !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %35, %36, %34, %43, %44, %45, %38, %40 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>, tensor<128x32xf16, #blocked1>, tensor<32x128xf16, #blocked> } - triton_gpu.local_dealloc %10 : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %11 : !tt.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %10 : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %11 : !triton_gpu.memdesc<2x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> tt.return %27#2 : tensor<128x128xf32, #mma> } @@ -404,8 +404,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %c0 = arith.constant 0 : index %c1_i32 = arith.constant 1 : i32 %cst_0 = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> %2 = arith.cmpi sgt, %arg1, %c0 : index %3 = tt.splat %2 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> %4 = tt.load %arg3, %3 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> @@ -421,17 +421,17 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %14 = tt.load %12, %13 : tensor<16x16x!tt.ptr, #blocked> %15 = tt.splat %5 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> %16 = tt.load %6, %15 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %17 = triton_gpu.memdesc_subview %0[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %8, %17 : 
tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %1[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %18 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %19:7 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %arg2, %arg9 = %6, %arg10 = %c0_i32, %arg11 = %17, %arg12 = %18, %arg13 = %16) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { + %17 = triton_gpu.memdesc_subview %0[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %8, %17 : tensor<16x16xf16, #blocked1> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %1[%c0_i32, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %18 : tensor<16x16xf16, #blocked> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %19:7 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %arg2, %arg9 = %6, %arg10 = %c0_i32, %arg11 = %17, %arg12 = %18, %arg13 = %16) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { %20 = arith.subi %arg1, %c2 : index %21 = arith.cmpi slt, %arg6, %20 : index %22 = arith.subi %arg1, %c1 : index %23 = arith.cmpi slt, %arg6, %22 : index - %24 = triton_gpu.local_load %arg11 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %25 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_load %arg11 : !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %25 = triton_gpu.local_load %arg12 : !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %26 = tt.dot %24, %25, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> %27 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> %28 = tt.addptr %arg9, %cst_0 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, 
tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> @@ -448,14 +448,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %39 = arith.addi %arg10, %c1_i32 : i32 %40 = arith.cmpi slt, %39, %c1_i32 : i32 %41 = arith.select %40, %39, %c0_i32 : i32 - %42 = triton_gpu.memdesc_subview %0[%41, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %30, %42 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %43 = triton_gpu.memdesc_subview %1[%41, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %36, %43 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - scf.yield %26, %27, %28, %41, %42, %43, %38 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %42 = triton_gpu.memdesc_subview %0[%41, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %30, %42 : tensor<16x16xf16, #blocked1> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %43 = triton_gpu.memdesc_subview %1[%41, %c0_i32, %c0_i32] : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %36, %43 : tensor<16x16xf16, #blocked> -> !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + scf.yield %26, %27, %28, %41, %42, %43, %38 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !triton_gpu.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> } - triton_gpu.local_dealloc %0 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %1 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> tt.return %19#0 : tensor<16x16xf32, #mma> } } @@ -463,18 +463,18 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- // CHECK-LABEL: sink_convert_dealloc -// CHECK-COUNT-2: triton_gpu.local_dealloc %{{.+}} : !tt.memdesc<4x128x64xf16, #shared, mutable> +// CHECK-COUNT-2: triton_gpu.local_dealloc %{{.+}} : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> // CHECK: triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> 
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}>
 #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1]}>
 module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   tt.func public @sink_convert_dealloc(%arg0: tensor<32x32xf32, #blocked>) attributes {noinline = false} {
-    %0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable>
-    %1 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable>
+    %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    %1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     %2 = triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1>
-    triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable>
-    triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     %3 = arith.addf %2, %2 : tensor<32x32xf32, #blocked1>
     tt.return
   }
 }
@@ -490,12 +490,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
 #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1]}>
 module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   tt.func public @anchor_barrier(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} {
-    %0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable>
+    %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     gpu.barrier
     %2 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked>
-    %1 = triton_gpu.local_alloc %2 : (tensor<32x32xf16, #blocked>) -> !tt.memdesc<4x128x64xf16, #shared, mutable>
-    triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable>
-    triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable>
+    %1 = triton_gpu.local_alloc %2 : (tensor<32x32xf16, #blocked>) -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
+    triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable>
     tt.return
   }
 }
diff --git a/test/TritonGPU/amd/amd-sched-2nd-load.mlir b/test/TritonGPU/amd/amd-sched-2nd-load.mlir
index bcf769320e..09c71215f9 100644
--- a/test/TritonGPU/amd/amd-sched-2nd-load.mlir
+++ b/test/TritonGPU/amd/amd-sched-2nd-load.mlir
@@ -30,18 +30,18 @@
 // CHECK-NEXT: triton_gpu.local_store %[[tileA]]
 // CHECK-NEXT: triton_gpu.local_store %[[tileB]]
 module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
-  tt.func public @sink_2nd_load_256x256x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) {
+  tt.func public @sink_2nd_load_256x256x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) {
%c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x128x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<128x256x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x256xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x128xf16, #dotOp0> * tensor<128x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<128x256xf16, #blocked1> -> !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<128x256xf16, #blocked1> -> !triton_gpu.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> @@ -69,18 +69,18 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: triton_gpu.local_store %[[tileA]] // CHECK-NEXT: triton_gpu.local_store %[[tileB]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x256x64(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x256x64(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x64x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<64x256x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x256xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<64x256xf16, #shared1, 
#triton_gpu.shared_memory, mutable> -> tensor<64x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x64xf16, #dotOp0> * tensor<64x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> @@ -108,18 +108,18 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: triton_gpu.local_store %[[tileA]] // CHECK-NEXT: triton_gpu.local_store %[[tileB]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x64x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x64x!tt.ptr, #blocked1>, %C_ptr: tensor<256x64x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x64x128(%A_ptr: tensor<256x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x64x!tt.ptr, #blocked1>, %C_ptr: tensor<256x64x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x64xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x64xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x128x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<128x64x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x64xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x64xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x128xf16, #dotOp0> * tensor<128x64xf16, #dotOp1> -> tensor<256x64xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<128x64xf16, #blocked1> -> !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !triton_gpu.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<128x64xf16, #blocked1> -> !triton_gpu.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x64xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x64x!tt.ptr, #mma> @@ -147,18 +147,18 @@ 
module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: triton_gpu.local_store %[[tileA]] // CHECK-NEXT: triton_gpu.local_store %[[tileB]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x256x32(%A_ptr: tensor<256x32x!tt.ptr, #blocked>, %B_ptr: tensor<32x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x256x32(%A_ptr: tensor<256x32x!tt.ptr, #blocked>, %B_ptr: tensor<32x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x32x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x32xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x32xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<32x256x!tt.ptr, #blocked1> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x256xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x32xf16, #dotOp0> * tensor<32x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x32xf16, #blocked> -> !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<32x256xf16, #blocked1> -> !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x32xf16, #blocked> -> !triton_gpu.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<32x256xf16, #blocked1> -> !triton_gpu.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> @@ -188,18 +188,18 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war // CHECK-NEXT: tt.dot // CHECK-NEXT: triton_gpu.local_store %[[tileA]] module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_128x128x128_user_before_dot(%A_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr2: tensor<128x128x!tt.ptr, #blocked>, %C_ptr: tensor<128x128x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_128x128x128_user_before_dot(%A_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr: tensor<128x128x!tt.ptr, #blocked>, %B_ptr2: tensor<128x128x!tt.ptr, #blocked>, %C_ptr: tensor<128x128x!tt.ptr, #mma>, %A_LDS: 
!triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<128x128xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<128x128x!tt.ptr, #blocked> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp0> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp0> %5 = tt.load %B_ptr : tensor<128x128x!tt.ptr, #blocked> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp1> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp1> tt.store %B_ptr, %5 : tensor<128x128x!tt.ptr, #blocked> %3 = tt.dot %1, %2, %arg1 : tensor<128x128xf16, #dotOp0> * tensor<128x128xf16, #dotOp1> -> tensor<128x128xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<128x128xf16, #blocked> -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<128x128xf16, #blocked> -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<128x128xf32, #mma> } tt.store %C_ptr, %0#0: tensor<128x128x!tt.ptr, #mma> @@ -229,19 +229,19 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war #dotOp0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}> #dotOp1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}> module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @sink_2nd_load_256x256x64_two_dot(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + tt.func public @sink_2nd_load_256x256x64_two_dot(%A_ptr: tensor<256x64x!tt.ptr, #blocked>, %B_ptr: tensor<64x256x!tt.ptr, #blocked1>, %C_ptr: tensor<256x256x!tt.ptr, #mma>, %A_LDS: !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>, %B_LDS: !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>) { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma> %0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 { %4 = tt.load %A_ptr : tensor<256x64x!tt.ptr, #blocked> %5 = tt.load %B_ptr : tensor<64x256x!tt.ptr, #blocked1> - %1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> - %2 = triton_gpu.local_load %B_LDS : !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x256xf16, #dotOp1> + %1 = triton_gpu.local_load %A_LDS : !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0> + %2 = triton_gpu.local_load %B_LDS : !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, 
mutable> -> tensor<64x256xf16, #dotOp1> %3 = tt.dot %1, %2, %arg1 : tensor<256x64xf16, #dotOp0> * tensor<64x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> %6 = tt.dot %1, %2, %3 : tensor<256x64xf16, #dotOp0> * tensor<64x256xf16, #dotOp1> -> tensor<256x256xf32, #mma> - triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !triton_gpu.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !triton_gpu.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> scf.yield %3 : tensor<256x256xf32, #mma> } tt.store %C_ptr, %0#0: tensor<256x256x!tt.ptr, #mma> diff --git a/test/TritonGPU/amd/optimize-lds-usage.mlir b/test/TritonGPU/amd/optimize-lds-usage.mlir index 5cd34aab27..61d1861b29 100644 --- a/test/TritonGPU/amd/optimize-lds-usage.mlir +++ b/test/TritonGPU/amd/optimize-lds-usage.mlir @@ -13,9 +13,9 @@ #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_load(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf32, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> tt.return } } @@ -35,9 +35,9 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_small_load(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf16, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> tt.return } } @@ -57,9 +57,9 @@ module attributes 
{"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1, 2], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_3d_load(%arg0: tensor<1x128x128xf16, #blocked>, %arg1: tensor<1x128x128xf16, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<1x128x128xf16, #blocked>) -> !tt.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<1x128x128xf16, #blocked>) -> !triton_gpu.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<1x128x128xf16, #blocked> -> tensor<1x128x128xf16, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<1x128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<1x128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<1x128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> tt.return } } @@ -81,9 +81,9 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @alloc_convert_32k_limit(%arg0: tensor<64x128xf16, #blocked>, %arg1: tensor<64x128xf16, #blocked>) attributes {noinline = false} { - %1 = triton_gpu.local_alloc %arg0 : (tensor<64x128xf16, #blocked>) -> !tt.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_alloc %arg0 : (tensor<64x128xf16, #blocked>) -> !triton_gpu.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> %2 = triton_gpu.convert_layout %arg1 : tensor<64x128xf16, #blocked> -> tensor<64x128xf16, #mma> - %3 = triton_gpu.local_load %1 : !tt.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, kWidth = 4, parent = #mma}>> + %3 = triton_gpu.local_load %1 : !triton_gpu.memdesc<64x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, kWidth = 4, parent = #mma}>> tt.return } } @@ -98,11 +98,11 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war // CHECK-DAG: [[SHARED:#[a-z0-9]*]] = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> // CHECK: tt.func public @mfma_dot_shortcut([[ARG_0:%[a-z0-9]*]]: {{.*}}, [[ARG_1:%[a-z0-9]*]]: {{.*}}, [[ARG_2:%[a-z0-9]*]]: {{.*}}) -// CHECK: [[ALLOC:%[0-9]+]] = triton_gpu.local_alloc [[ARG_0]] : (tensor<128x128xf16, [[BLOCKED_1]]>) -> !tt.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> +// CHECK: [[ALLOC:%[0-9]+]] = triton_gpu.local_alloc [[ARG_0]] : (tensor<128x128xf16, [[BLOCKED_1]]>) -> !triton_gpu.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> // CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = triton_gpu.convert_layout [[ARG_1]] : tensor<128x128xf32, [[BLOCKED_1]]> -> tensor<128x128xf32, [[BLOCKED_2]]> // CHECK: [[CONVERT_1:%[0-9]+]] = triton_gpu.convert_layout [[INTERMEDIATE_CONV]] : tensor<128x128xf32, [[BLOCKED_2]]> -> tensor<128x128xf32, [[MMA_2]]> // CHECK: [[CONVERT_2:%[0-9]+]] = triton_gpu.convert_layout [[ARG_2]] : tensor<256x128xf16, [[MMA_1]]> -> tensor<256x128xf16, 
#triton_gpu.dot_op<{opIdx = 0, parent = [[MMA_1]], kWidth = 4}>> -// CHECK: [[LOAD:%[0-9]+]] = triton_gpu.local_load [[ALLOC]] : !tt.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = [[MMA_2]], kWidth = 4}>> +// CHECK: [[LOAD:%[0-9]+]] = triton_gpu.local_load [[ALLOC]] : !triton_gpu.memdesc<128x128xf16, [[SHARED]], #triton_gpu.shared_memory> -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = [[MMA_2]], kWidth = 4}>> #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}> #mma1 = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}> #mma2 = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}> @@ -111,10 +111,10 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @mfma_dot_shortcut(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf32, #blocked>, %arg2: tensor<256x128xf16, #mma2>) attributes {noinline = false} { - %alloc = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> + %alloc = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> %convert_1 = triton_gpu.convert_layout %arg1 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #mma1> %convert_2 = triton_gpu.convert_layout %arg2 : tensor<256x128xf16, #mma2> -> tensor<256x128xf16, #dotop2> - %load = triton_gpu.local_load %alloc : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #dotop1> + %load = triton_gpu.local_load %alloc : !triton_gpu.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf16, #dotop1> tt.return } } @@ -131,9 +131,9 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @convert_1d(%arg0: tensor<128xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>, %arg1: tensor<128x128xf32, #mma>) attributes {noinline = false} { - %alloc = triton_gpu.local_alloc %arg1 : (tensor<128x128xf32, #mma>) -> !tt.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> + %alloc = triton_gpu.local_alloc %arg1 : (tensor<128x128xf32, #mma>) -> !triton_gpu.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> %1 = triton_gpu.convert_layout %arg0 : tensor<128xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> -> tensor<128xf32, #blocked> - %load = triton_gpu.local_load %alloc : !tt.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf32, #mma> + %load = triton_gpu.local_load %alloc : !triton_gpu.memdesc<128x128xf32, #shared, #triton_gpu.shared_memory> -> tensor<128x128xf32, #mma> tt.return } } diff --git a/test/TritonGPU/canonicalize.mlir b/test/TritonGPU/canonicalize.mlir index 9422bb0f85..64385d9297 100644 --- a/test/TritonGPU/canonicalize.mlir +++ b/test/TritonGPU/canonicalize.mlir @@ -71,8 +71,8 @@ tt.func 
@test_canonicalize_convert_histogram(%arg0: tensor<256xi32, #blocked1>) #shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}> module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.compute-capability" = 80} { tt.func @test_canonicalize_convert_local_load() -> tensor<256xi32, #blocked1> { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<256xi32, #shared, mutable> - %1 = triton_gpu.local_load %0 : !tt.memdesc<256xi32, #shared, mutable> -> tensor<256xi32, #blocked> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<256xi32, #shared, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<256xi32, #shared, mutable> -> tensor<256xi32, #blocked> gpu.barrier %2 = triton_gpu.convert_layout %1 : tensor<256xi32, #blocked> -> tensor<256xi32, #blocked1> tt.return %2 : tensor<256xi32, #blocked1> @@ -85,15 +85,15 @@ tt.func @test_canonicalize_convert_local_load() -> tensor<256xi32, #blocked1> { #shared = #triton_gpu.shared<{vec = 1, perPhase=2, maxPhase=8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: local_alloc_nofold1 - tt.func @local_alloc_nofold1(%arg0: tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { + tt.func @local_alloc_nofold1(%arg0: tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { // CHECK: %[[ARG:.+]] = triton_gpu.local_alloc // CHECK-NEXT: %[[ARG2:.+]] = triton_gpu.local_load %[[ARG]] // CHECK-NEXT: %[[ARG3:.+]] = triton_gpu.local_alloc %[[ARG2]] // CHECK-NEXT: tt.return %[[ARG3]] - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - tt.return %2 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #blocked> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + tt.return %2 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> } } // end module @@ -105,15 +105,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : #shared1 = #triton_gpu.shared<{vec = 1, perPhase=1, maxPhase=1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: local_alloc_nofold2 - tt.func @local_alloc_nofold2(%arg0: tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> { + tt.func @local_alloc_nofold2(%arg0: tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> { // CHECK: %[[ARG:.+]] = triton_gpu.local_alloc // CHECK-NEXT: %[[ARG2:.+]] = triton_gpu.local_load %[[ARG]] // CHECK-NEXT: %[[ARG3:.+]] = triton_gpu.local_alloc 
%[[ARG2]] // CHECK-NEXT: tt.return %[[ARG3]] - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> - tt.return %2 : !tt.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> + tt.return %2 : !triton_gpu.memdesc<16x16xf16, #shared1, #triton_gpu.shared_memory> } } // end module @@ -123,13 +123,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> #shared = #triton_gpu.shared<{vec = 1, perPhase=2, maxPhase=8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { - tt.func @local_alloc_fold(%arg0: tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { + tt.func @local_alloc_fold(%arg0: tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> { // CHECK-LABEL: local_alloc_fold // CHECK-NEXT: %[[ARG:.+]] = triton_gpu.local_alloc // CHECK-NEXT: tt.return %[[ARG]] - %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> - tt.return %2 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %1 = triton_gpu.local_load %0 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #blocked> + %2 = triton_gpu.local_alloc %1 : (tensor<16x16xf16, #blocked>) -> !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + tt.return %2 : !triton_gpu.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> } } // end module diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir index 2ec11a24f1..5e1cad52af 100644 --- a/test/TritonGPU/combine.mlir +++ b/test/TritonGPU/combine.mlir @@ -1561,9 +1561,9 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war %20 = triton_gpu.convert_layout %16 : tensor<32x32x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked4> %21 = tt.load %20 : tensor<32x32x!tt.ptr, #blocked4> %22 = triton_gpu.convert_layout %21 : tensor<32x32xf16, #blocked4> -> tensor<32x32xf16, #blocked> - %23 = triton_gpu.local_alloc %22 : (tensor<32x32xf16, #blocked>) -> !tt.memdesc<32x32xf16, #shared> - %24 = triton_gpu.memdesc_trans %23 
{order=array} : !tt.memdesc<32x32xf16, #shared> -> !tt.memdesc<32x32xf16, #shared1> - %25 = triton_gpu.local_load %24 : !tt.memdesc<32x32xf16, #shared1> -> tensor<32x32xf16, #blocked> + %23 = triton_gpu.local_alloc %22 : (tensor<32x32xf16, #blocked>) -> !triton_gpu.memdesc<32x32xf16, #shared> + %24 = triton_gpu.memdesc_trans %23 {order=array} : !triton_gpu.memdesc<32x32xf16, #shared> -> !triton_gpu.memdesc<32x32xf16, #shared1> + %25 = triton_gpu.local_load %24 : !triton_gpu.memdesc<32x32xf16, #shared1> -> tensor<32x32xf16, #blocked> %26 = triton_gpu.convert_layout %19 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked5}>> %27 = triton_gpu.convert_layout %25 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked5}>> %28 = triton_gpu.convert_layout %cst : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked5> @@ -1993,10 +1993,10 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : %67 = tt.load %66 : tensor<32x64x!tt.ptr, #blocked> %68 = tt.addptr %17, %65 : tensor<256x64x!tt.ptr, #blocked>, tensor<256x64xi32, #blocked> %69 = tt.load %68 : tensor<256x64x!tt.ptr, #blocked> - %70 = triton_gpu.local_alloc %69 : (tensor<256x64xf16, #blocked>) -> !tt.memdesc<256x64xf16, #shared> - %71 = triton_gpu.memdesc_trans %70 {order=array} : !tt.memdesc<256x64xf16, #shared> -> !tt.memdesc<64x256xf16, #shared1> + %70 = triton_gpu.local_alloc %69 : (tensor<256x64xf16, #blocked>) -> !triton_gpu.memdesc<256x64xf16, #shared> + %71 = triton_gpu.memdesc_trans %70 {order=array} : !triton_gpu.memdesc<256x64xf16, #shared> -> !triton_gpu.memdesc<64x256xf16, #shared1> %72 = triton_gpu.convert_layout %67 : tensor<32x64xf16, #blocked> -> tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked3}>> - %73 = triton_gpu.local_load %71 : !tt.memdesc<64x256xf16, #shared1> -> tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked3}>> + %73 = triton_gpu.local_load %71 : !triton_gpu.memdesc<64x256xf16, #shared1> -> tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked3}>> %74 = triton_gpu.convert_layout %arg8 : tensor<32x256xf32, #blocked3> -> tensor<32x256xf32, #mma> %75 = triton_gpu.convert_layout %72 : tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked3}>> -> tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %76 = triton_gpu.convert_layout %73 : tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked3}>> -> tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> diff --git a/test/TritonGPU/dot-operands.mlir b/test/TritonGPU/dot-operands.mlir index 2d562b9587..911cf4fb40 100644 --- a/test/TritonGPU/dot-operands.mlir +++ b/test/TritonGPU/dot-operands.mlir @@ -162,10 +162,10 @@ tt.func @update_kwidth_slice( module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK: tt.func @mma_v3_reg_operand_A // CHECK: %[[A:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> -tt.func @mma_v3_reg_operand_A(%arg0: tensor<128x64xf16, #mma>, %arg1: !tt.memdesc<64x64xf16, #shared>, %arg2: 
tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ - %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf16, #mma>) -> !tt.memdesc<128x64xf16, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !tt.memdesc<128x64xf16, #shared1> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> +// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> +tt.func @mma_v3_reg_operand_A(%arg0: tensor<128x64xf16, #mma>, %arg1: !triton_gpu.memdesc<64x64xf16, #shared>, %arg2: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ + %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf16, #mma>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } @@ -178,10 +178,10 @@ tt.func @mma_v3_reg_operand_A(%arg0: tensor<128x64xf16, #mma>, %arg1: !tt.memdes module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK: tt.func @mma_v3_reg_operand_A_fp8 // CHECK: %[[A:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<128x64xf8E5M2, #mma> -> tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> -// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !tt.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> -tt.func @mma_v3_reg_operand_A_fp8(%arg0: tensor<128x64xf8E5M2, #mma>, %arg1: !tt.memdesc<64x64xf8E5M2, #shared>, %arg2: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ - %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf8E5M2, #mma>) -> !tt.memdesc<128x64xf8E5M2, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !tt.memdesc<128x64xf8E5M2, #shared1> * !tt.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> +// CHECK: triton_nvidia_gpu.warp_group_dot %[[A]], {{.*}} : tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * !triton_gpu.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> +tt.func @mma_v3_reg_operand_A_fp8(%arg0: tensor<128x64xf8E5M2, #mma>, %arg1: !triton_gpu.memdesc<64x64xf8E5M2, #shared>, %arg2: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ + %A = triton_gpu.local_alloc %arg0 : (tensor<128x64xf8E5M2, #mma>) -> !triton_gpu.memdesc<128x64xf8E5M2, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %A, %arg1, %arg2 : !triton_gpu.memdesc<128x64xf8E5M2, #shared1> * !triton_gpu.memdesc<64x64xf8E5M2, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } @@ -220,12 +220,12 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: %[[A_BLOCK:.*]] = tt.load %{{.*}} : tensor<128x64x!tt.ptr, #blocked> // CHECK: %[[A_DOTOP:.*]] = triton_gpu.convert_layout %[[A_BLOCK]] : tensor<128x64xbf16, #blocked> -> tensor<128x64xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // CHECK: %[[A_CASTED:.*]] = tt.fp_to_fp %[[A_DOTOP]] : tensor<128x64xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_CASTED]], %{{.*}}, 
%{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> - tt.func @mma_v3_reg_push_elementwise(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !tt.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ +// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_CASTED]], %{{.*}}, %{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tt.func @mma_v3_reg_push_elementwise(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !triton_gpu.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ %a_bf16 = tt.load %pa : tensor<128x64x!tt.ptr, #blocked> %a = tt.fp_to_fp %a_bf16 : tensor<128x64xbf16, #blocked> -> tensor<128x64xf16, #blocked> - %dota = triton_gpu.local_alloc %a: (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !tt.memdesc<128x64xf16, #shared1> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + %dota = triton_gpu.local_alloc %a: (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } @@ -244,15 +244,15 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: %[[A_CASTED:.*]] = arith.sitofp %[[A_DOTOP]] : tensor<128x64xi8, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> to tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // CHECK: %[[A_SCALED:.*]] = arith.mulf %[[A_CASTED]], %[[CST_DOTOP]] : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // CHECK: %[[A_NEGATED:.*]] = arith.negf %[[A_SCALED]] : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_NEGATED]], %{{.*}}, %{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> - tt.func @mma_v3_reg_push_elementwise_chained(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !tt.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ +// CHECK: %[[R:.*]] = triton_nvidia_gpu.warp_group_dot %[[A_NEGATED]], %{{.*}}, %{{.*}} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tt.func @mma_v3_reg_push_elementwise_chained(%pa: tensor<128x64x!tt.ptr, #blocked>, %dotb: !triton_gpu.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #blocked> %a_i8 = tt.load %pa : tensor<128x64x!tt.ptr, #blocked> %a_f16 = arith.sitofp %a_i8 : tensor<128x64xi8, #blocked> to tensor<128x64xf16, #blocked> %a_scaled = arith.mulf %a_f16, %cst : tensor<128x64xf16, #blocked> %a_negated = arith.negf %a_scaled : tensor<128x64xf16, #blocked> - %dota = triton_gpu.local_alloc %a_negated: (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared1> - %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !tt.memdesc<128x64xf16, #shared1> * !tt.memdesc<64x64xf16, #shared> 
-> tensor<128x64xf32, #mma> + %dota = triton_gpu.local_alloc %a_negated: (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> tt.return %r : tensor<128x64xf32, #mma> } } diff --git a/test/TritonGPU/fence-inserstion.mlir b/test/TritonGPU/fence-inserstion.mlir index 9ed3646d92..f83acb21f1 100644 --- a/test/TritonGPU/fence-inserstion.mlir +++ b/test/TritonGPU/fence-inserstion.mlir @@ -9,10 +9,10 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK-LABEL: matmul_like_fence tt.func public @matmul_like_fence(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x64xf16, #blocked2>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared> - %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked2>) -> !tt.memdesc<128x64xf16, #shared1> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared> + %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked2>) -> !triton_gpu.memdesc<128x64xf16, #shared1> // CHECK: triton_nvidia_gpu.fence_async_shared - %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !tt.memdesc<128x128xf16, #shared> * !tt.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> + %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !triton_gpu.memdesc<128x128xf16, #shared> * !triton_gpu.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> tt.return } } @@ -31,15 +31,15 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %c64_i32 = arith.constant 64 : i32 %c0_i32 = arith.constant 0 : i32 %c32_i32 = arith.constant 32 : i32 - %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared> - %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared1> + %0 = triton_gpu.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared> + %1 = triton_gpu.local_alloc %arg1 : (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> // CHECK: triton_nvidia_gpu.fence_async_shared // CHECK: scf.for // CHECK-NOT: triton_nvidia_gpu.fence_async_shared // CHECK: triton_nvidia_gpu.warp_group_dot scf.for %iv0 = %c0_i32 to %c64_i32 step %c32_i32 : i32 { scf.for %iv1 = %c0_i32 to %c64_i32 step %c32_i32 : i32 { - %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !tt.memdesc<128x128xf16, #shared> * !tt.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> + %2 = triton_nvidia_gpu.warp_group_dot %0, %1, %cst : !triton_gpu.memdesc<128x128xf16, #shared> * !triton_gpu.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> } } tt.return diff --git a/test/TritonGPU/invalid.mlir b/test/TritonGPU/invalid.mlir index f9e265f3ee..5a91a3cc0c 100644 --- a/test/TritonGPU/invalid.mlir +++ b/test/TritonGPU/invalid.mlir @@ -1,45 +1,45 @@ // RUN: triton-opt --split-input-file %s --verify-diagnostics -tt.func public @subview_element_ty(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @subview_element_ty(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{element type}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc<8x16xf16> + %a = 
triton_gpu.memdesc_subview %arg0[%zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc<8x16xf16> tt.return } // ----- -tt.func public @too_many_offsets(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @too_many_offsets(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{offsets}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc + %a = triton_gpu.memdesc_subview %arg0[%zero, %zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc tt.return } // ----- -tt.func public @too_few_offsets(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @too_few_offsets(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{offsets}} - %a = triton_gpu.memdesc_subview %arg0[%zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc + %a = triton_gpu.memdesc_subview %arg0[%zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc tt.return } // ----- -tt.func public @result_rank_too_large(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @result_rank_too_large(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{result rank}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc<3x8x16xf32> + %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc<3x8x16xf32> tt.return } // ----- -tt.func public @result_dim_too_large(%arg0: !tt.memdesc<8x16xf32>) { +tt.func public @result_dim_too_large(%arg0: !triton_gpu.memdesc<8x16xf32>) { %zero = arith.constant 0 : i32 // expected-error @+1 {{result shape}} - %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !tt.memdesc<8x16xf32> -> !tt.memdesc<32xf32> + %a = triton_gpu.memdesc_subview %arg0[%zero, %zero] : !triton_gpu.memdesc<8x16xf32> -> !triton_gpu.memdesc<32xf32> tt.return } diff --git a/test/TritonGPU/loop-pipeline-cuda.mlir b/test/TritonGPU/loop-pipeline-cuda.mlir index 3cb8511b0b..fe8f45e92f 100644 --- a/test/TritonGPU/loop-pipeline-cuda.mlir +++ b/test/TritonGPU/loop-pipeline-cuda.mlir @@ -49,9 +49,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = triton_gpu.memdesc_trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = triton_gpu.memdesc_trans %24 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !triton_gpu.memdesc<16x64xf16, #shared1, 
#triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -140,9 +140,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> - %73 = triton_gpu.memdesc_trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> - %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> + %73 = triton_gpu.memdesc_trans %72 {order=array} : !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> + %74 = triton_gpu.local_load %73 : !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> @@ -169,9 +169,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: @matmul_tma -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x128x64xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x64x256xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3xi64, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !triton_gpu.memdesc<3x128x64xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !triton_gpu.memdesc<3x64x256xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !triton_gpu.memdesc<3xi64, #{{.+}}, #triton_gpu.shared_memory, mutable> // CHECK-COUNT-3: triton_nvidia_gpu.init_barrier // CHECK-COUNT-4: triton_nvidia_gpu.async_tma_copy_global_to_local // CHECK: scf.for @@ -187,10 +187,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %cst = arith.constant dense<0.000000e+00> : 
tensor<128x256xf32, #mma> %0:2 = scf.for %arg3 = %c0_i32 to %c256_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %c0_i32) -> (tensor<128x256xf32, #mma>, i32) : i32 { %1 = tt.experimental_descriptor_load %arg0[%c0_i32, %arg5] : !tt.tensordesc> -> tensor<128x64xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %2 = triton_gpu.local_alloc %1 : (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> %3 = tt.experimental_descriptor_load %arg1[%arg5, %c0_i32] : !tt.tensordesc> -> tensor<64x256xf16, #blocked1> - %4 = triton_gpu.local_alloc %3 : (tensor<64x256xf16, #blocked1>) -> !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> - %5 = triton_nvidia_gpu.warp_group_dot %2, %4, %arg4 { inputPrecision = 0 : i32 } : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> + %4 = triton_gpu.local_alloc %3 : (tensor<64x256xf16, #blocked1>) -> !triton_gpu.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> + %5 = triton_nvidia_gpu.warp_group_dot %2, %4, %arg4 { inputPrecision = 0 : i32 } : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> %6 = arith.addi %arg5, %c64_i32 : i32 scf.yield %5, %6 : tensor<128x256xf32, #mma>, i32 } diff --git a/test/TritonGPU/loop-pipeline-hip.mlir b/test/TritonGPU/loop-pipeline-hip.mlir index 4ad94615c8..6ca0897578 100644 --- a/test/TritonGPU/loop-pipeline-hip.mlir +++ b/test/TritonGPU/loop-pipeline-hip.mlir @@ -47,9 +47,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %25 = triton_gpu.memdesc_trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %25 = triton_gpu.memdesc_trans %24 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + %26 = triton_gpu.local_load %25 : !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, 
%27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -139,9 +139,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - %73 = triton_gpu.memdesc_trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + %73 = triton_gpu.memdesc_trans %72 {order=array} : !triton_gpu.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + %74 = triton_gpu.local_load %73 : !triton_gpu.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> @@ -252,9 +252,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> %0 = scf.for %arg2 = %c0_i32 to %arg1 step %c1_i32 iter_args(%arg3 = %cst) -> (tensor<32x32xf32, #blocked>) : i32 { %2 = tt.load %arg4 : tensor<32x32x!tt.ptr, #blocked1> - %3 = triton_gpu.local_alloc %2 : (tensor<32x32xf32, #blocked1>) -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> - %4 = triton_gpu.memdesc_trans %3 {order = array} : !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> - %5 = triton_gpu.local_load %4 : !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> + %3 = triton_gpu.local_alloc %2 : (tensor<32x32xf32, #blocked1>) -> !triton_gpu.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> + %4 = triton_gpu.memdesc_trans %3 {order = array} : !triton_gpu.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> + %5 = triton_gpu.local_load %4 : !triton_gpu.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> %6 = triton_gpu.convert_layout %2 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> %7 = tt.dot %6, %5, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 
1, parent = #blocked}>> -> tensor<32x32xf32, #blocked> scf.yield %7 : tensor<32x32xf32, #blocked> diff --git a/test/TritonGPU/loop-pipeline-hopper.mlir b/test/TritonGPU/loop-pipeline-hopper.mlir index f3784fbe8c..d358be4d97 100644 --- a/test/TritonGPU/loop-pipeline-hopper.mlir +++ b/test/TritonGPU/loop-pipeline-hopper.mlir @@ -19,7 +19,7 @@ // CHECK: %[[BBUFFER:.*]] = triton_gpu.local_alloc // CHECK-DAG: %[[LOOP_COND_0:.*]] = arith.cmpi slt, %[[LB:.*]], %[[UB:.*]] // CHECK-DAG: %[[LOOP_COND_0_SPLAT_A:.*]] = tt.splat %[[LOOP_COND_0]] -// CHECK-DAG: %[[ASUB:.*]] = triton_gpu.memdesc_subview %[[ABUFFER]][%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]] : !tt.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: %[[ASUB:.*]] = triton_gpu.memdesc_subview %[[ABUFFER]][%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]] : !triton_gpu.memdesc<2x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK: %[[T_A0:.*]] = triton_gpu.async_copy_global_to_local %{{.*}}, %[[ASUB]] mask %[[LOOP_COND_0_SPLAT_A]] : tensor<128x32x!tt.ptr, #blocked1> -> <128x32xf16, #shared, #triton_gpu.shared_memory, mutable> // CHECK-DAG: %[[LOOP_COND_0_SPLAT_B:.*]] = tt.splat %[[LOOP_COND_0]] // CHECK-DAG: %[[BSUB:.*]] = triton_gpu.memdesc_subview %[[BBUFFER]][%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]] @@ -333,8 +333,8 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, // %a = tt.load %a_tileptr : !tt.ptr, 1> // %b = tt.load %b_tileptr : !tt.ptr, 1> // -// %sa = triton_gpu.local_alloc %a : (tensor<128x32xf16, #BA>) -> !tt.memdesc<128x32xf16, #SA, #triton_gpu.shared_memory> -// %sb = triton_gpu.local_alloc %b : (tensor<32x128xf16, #BB>) -> !tt.memdesc<32x128xf16, #SB, #triton_gpu.shared_memory> +// %sa = triton_gpu.local_alloc %a : (tensor<128x32xf16, #BA>) -> !triton_gpu.memdesc<128x32xf16, #SA, #triton_gpu.shared_memory> +// %sb = triton_gpu.local_alloc %b : (tensor<32x128xf16, #BB>) -> !triton_gpu.memdesc<32x128xf16, #SB, #triton_gpu.shared_memory> // %c = triton_nvidia_gpu.warp_group_dot %sa, %sb, %prev_c : tensor<128x32xf16, #SA> * tensor<32x128xf16, #SB> -> tensor<128x128xf32, #C> // // %a_tileptr_next = tt.advance %a_tileptr, [%c0, %c32_i32] : !tt.ptr, 1> @@ -393,13 +393,13 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: scf.yield %17:2 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_3, %arg5 = %16) -> (tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>) : i32 { %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %cst_2 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %cst_2 : 
!triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %22 = arith.truncf %21 : tensor<128x16xf32, #mma1> to tensor<128x16xf16, #mma1> - %23 = triton_gpu.memdesc_trans %20 {order=array} : !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> + %23 = triton_gpu.memdesc_trans %20 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> %24 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma1> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> - %25 = triton_nvidia_gpu.warp_group_dot %24, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> + %25 = triton_nvidia_gpu.warp_group_dot %24, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> %26 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %25, %26 : tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked> } @@ -445,9 +445,9 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %cnd = arith.cmpi slt, %arg3, %ext : i32 %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> @@ -502,24 +502,24 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %19 = 
triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> // CHECK: %[[ALLOC1:.+]] = triton_gpu.local_alloc // CHECK: %[[ALLOC2:.+]] = triton_gpu.local_alloc // CHECK: %[[R:.+]]:{{.+}} = scf.for // CHECK: %[[DOT1:.+]] = triton_nvidia_gpu.warp_group_dot{{.*}} // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} - // CHECK: %[[TRANS:.+]] = triton_gpu.memdesc_trans{{.*}} : !tt.memdesc + // CHECK: %[[TRANS:.+]] = triton_gpu.memdesc_trans{{.*}} : !triton_gpu.memdesc // CHECK: %[[DOT2:.+]] = triton_nvidia_gpu.warp_group_dot{{.*}} %[[TRANS]] // CHECK: triton_nvidia_gpu.warp_group_dot_wait %[[DOT1]], %[[DOT2]], %[[ALLOC1]], %[[ALLOC2]], %[[TRANS]] {pendings = 2 : i32} // CHECK: scf.yield // CHECK: %{{.*}}:2 = triton_nvidia_gpu.warp_group_dot_wait %[[R]]#{{.+}}, %[[R]]#{{.+}} {pendings = 0 : i32} : tensor<128x16xf32, #{{.*}}>, tensor<128x64xf32, #{{.*}}> %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_3, %arg5 = %16, %arg6 = %cst_2) -> (tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1>) : i32 { - %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %arg6 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %21 = triton_nvidia_gpu.warp_group_dot %19, %20, %arg6 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %l = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %23 = triton_gpu.memdesc_trans %c {order=array} : !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> - %25 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> + %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %23 = triton_gpu.memdesc_trans %c {order=array} : !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> + %25 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> %26 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %25, %26, %21 : tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1> } @@ -577,13 +577,13 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %22:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %12, %arg6 = %21) -> (tensor<128x256xf32, #mma>, tensor<128x64x!tt.ptr, #blocked>, tensor<64x256x!tt.ptr, #blocked1>) : i32 { %35 = tt.load %arg5 : tensor<128x64x!tt.ptr, #blocked> %36 = tt.load %arg6 : tensor<64x256x!tt.ptr, #blocked1> - %37 = triton_gpu.local_alloc %35 : 
(tensor<128x64xf8E5M2, #blocked>) -> !tt.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> - %38 = triton_gpu.local_alloc %36 : (tensor<64x256xf8E5M2, #blocked1>) -> !tt.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> + %37 = triton_gpu.local_alloc %35 : (tensor<128x64xf8E5M2, #blocked>) -> !triton_gpu.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> + %38 = triton_gpu.local_alloc %36 : (tensor<64x256xf8E5M2, #blocked1>) -> !triton_gpu.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> // CHECK: triton_gpu.local_alloc // CHECK: scf.for // CHECK: triton_nvidia_gpu.warp_group_dot // CHECK-NEXT: triton_nvidia_gpu.warp_group_dot_wait - %39 = triton_nvidia_gpu.warp_group_dot %37, %38, %arg4 {maxNumImpreciseAcc = 1073741824 : i32} : !tt.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> + %39 = triton_nvidia_gpu.warp_group_dot %37, %38, %arg4 {maxNumImpreciseAcc = 1073741824 : i32} : !triton_gpu.memdesc<128x64xf8E5M2, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x256xf8E5M2, #shared1, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> %40 = tt.addptr %arg5, %cst_6 : tensor<128x64x!tt.ptr, #blocked>, tensor<128x64xi32, #blocked> %41 = tt.addptr %arg6, %cst_5 : tensor<64x256x!tt.ptr, #blocked1>, tensor<64x256xi32, #blocked1> scf.yield %39, %40, %41 : tensor<128x256xf32, #mma>, tensor<128x64x!tt.ptr, #blocked>, tensor<64x256x!tt.ptr, #blocked1> @@ -657,8 +657,8 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> // CHECK: %[[LOOP:[^ :]+]]{{.*}} scf.for {{.*}} iter_args(%[[PREV_DOT2:[^ ]+]] // CHECK-NOT: triton_nvidia_gpu.warp_group_dot_wait // CHECK: %[[DOT0:.+]] = triton_nvidia_gpu.warp_group_dot @@ -675,17 +675,17 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: triton_nvidia_gpu.warp_group_dot_wait %[[LOOP]]#3, %[[LOOP]]#0 {pendings = 0 : i32} %17:4 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%prev_dot2 = %cst_3, %arg5 = %16, %prev_dot1 = %cst_2, %prev_dot0 = %cst_2) -> (tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1>) : i32 { // This one can be async. 
- %dot0 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %dot0 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> // This can't be async because its result is modified before it's yielded. - %dot1 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %dot1 = triton_nvidia_gpu.warp_group_dot %19, %20, %prev_dot1 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %dot1.1 = arith.addf %dot1, %dot1 : tensor<128x16xf32, #mma1> %l = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %23 = triton_gpu.memdesc_trans %c {order=array} : !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> + %c = triton_gpu.local_alloc %l : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %23 = triton_gpu.memdesc_trans %c {order=array} : !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> // This dot can be async even though %prev_dot2 is not used directly by an // async dot, because that use follows the synchronous dot above. 
%prev_dot2.1 = arith.addf %prev_dot2, %prev_dot2 : tensor<128x64xf32, #mma> - %dot2 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %prev_dot2.1 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !tt.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> + %dot2 = triton_nvidia_gpu.warp_group_dot %cst_4, %23, %prev_dot2.1 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !triton_gpu.memdesc<16x64xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x64xf32, #mma> %26 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %dot2, %26, %dot1.1, %dot0 : tensor<128x64xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1> } @@ -719,7 +719,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-LABEL: tma_multiple_store_pipeline tt.func public @tma_multiple_store_pipeline(%arg0: tensor<1xf32, #blocked>, %arg1: !tt.tensordesc>, %arg2: i32, %arg3: i32) attributes {noinline = false} { %c0_i32 = arith.constant 0 : i32 - // CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc : () -> !tt.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> + // CHECK: %[[ALLOC:.+]] = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<1xf32, #shared, #triton_gpu.shared_memory, mutable> // CHECK: scf.for scf.for %arg4 = %c0_i32 to %arg3 step %arg2 : i32 { %1 = arith.divsi %arg4, %arg2 : i32 @@ -780,10 +780,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : %11 = tt.broadcast %10 : tensor<128x1xi32, #blocked1> -> tensor<128x128xi32, #blocked1> %12 = tt.addptr %1, %11 : tensor<128x128x!tt.ptr, #blocked1>, tensor<128x128xi32, #blocked1> %13 = tt.load %arg0 : tensor<128x128x!tt.ptr, #blocked> - %14 = triton_gpu.local_alloc %13 : (tensor<128x128xf8E4M3FNUZ, #blocked>) -> !tt.memdesc<128x128xf8E4M3FNUZ, #shared> + %14 = triton_gpu.local_alloc %13 : (tensor<128x128xf8E4M3FNUZ, #blocked>) -> !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared> %15 = tt.load %12 : tensor<128x128x!tt.ptr, #blocked1> - %16 = triton_gpu.local_alloc %15 : (tensor<128x128xf8E4M3FNUZ, #blocked1>) -> !tt.memdesc<128x128xf8E4M3FNUZ, #shared1> - %17 = triton_nvidia_gpu.warp_group_dot %14, %16, %arg9 {inputPrecision = 0 : i32, maxNumImpreciseAcc = 1073741824 : i32} : !tt.memdesc<128x128xf8E4M3FNUZ, #shared> * !tt.memdesc<128x128xf8E4M3FNUZ, #shared1> -> tensor<128x128xf32, #mma> + %16 = triton_gpu.local_alloc %15 : (tensor<128x128xf8E4M3FNUZ, #blocked1>) -> !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared1> + %17 = triton_nvidia_gpu.warp_group_dot %14, %16, %arg9 {inputPrecision = 0 : i32, maxNumImpreciseAcc = 1073741824 : i32} : !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared> * !triton_gpu.memdesc<128x128xf8E4M3FNUZ, #shared1> -> tensor<128x128xf32, #mma> %18 = tt.splat %7 : f32 -> tensor<128x128xf32, #mma> %19 = arith.mulf %17, %18 : tensor<128x128xf32, #mma> %20 = scf.if %6 -> (tensor<128x128xf32, #mma>) { @@ -852,9 +852,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : scf.yield %arg5 : tensor<64x16x!tt.ptr, #blocked> } %18 = tt.load %inc_ptr : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = 
triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> scf.yield %acc_zero : tensor<128x16xf32, #mma1> @@ -917,9 +917,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !triton_gpu.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> %cnd = arith.cmpi slt, %arg3, %ext : i32 %if_ret:2 = scf.if %cnd -> (tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked>) { %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> @@ -988,8 +988,8 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %b_block = tt.load %arg6 : tensor<64x16x!tt.ptr, #blocked> %a_dotop = triton_gpu.convert_layout %a_block : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> %a_dotop_mul = arith.mulf %a_dotop, %cst_4 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %b_smem = triton_gpu.local_alloc %b_block : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %21 = triton_nvidia_gpu.warp_group_dot %a_dotop_mul, %b_smem, %arg4 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> + %b_smem = triton_gpu.local_alloc %b_block : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %21 = triton_nvidia_gpu.warp_group_dot %a_dotop_mul, %b_smem, %arg4 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, 
parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> %25 = tt.addptr %arg5, %cst_3 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> %26 = tt.addptr %arg6, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> scf.yield %21, %25, %26 : tensor<128x16xf32, #mma>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x16x!tt.ptr, #blocked> diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index 973b35defb..bb7e102c90 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -930,9 +930,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = triton_gpu.memdesc_trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = triton_gpu.memdesc_trans %24 {order=array} : !triton_gpu.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !triton_gpu.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -1041,7 +1041,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.for // CHECK: %[[NEXT_BUFFER_1:.*]] = tt.addptr %{{.*}}, {{.*}} // CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_1]] -// CHECK: %[[IND_BUFFER_0:.*]] = triton_gpu.memdesc_subview {{.*}} : !tt.memdesc<1x16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> +// CHECK: %[[IND_BUFFER_0:.*]] = triton_gpu.memdesc_subview {{.*}} : !triton_gpu.memdesc<1x16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> -> !triton_gpu.memdesc<16xi64, #[[$SHARED_LAYOUT]], #triton_gpu.shared_memory, mutable> // CHECK: %[[IND_BUFFER_1:.*]] = triton_gpu.local_load %[[IND_BUFFER_0]] // CHECK: %[[IND_BUFFER_2:.*]] = tt.expand_dims %[[IND_BUFFER_1]] {axis = 1 : i32} // CHECK: %[[IND_BUFFER_3:.*]] = tt.broadcast %[[IND_BUFFER_2]] @@ -1361,9 +1361,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, 
#blocked>, tensor<16x16xi32, #blocked> scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> - %12 = triton_gpu.memdesc_trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> - %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !triton_gpu.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + %12 = triton_gpu.memdesc_trans %11 {order = array} : !triton_gpu.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !triton_gpu.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> + %13 = triton_gpu.local_load %12 : !triton_gpu.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { %14 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> %15 = triton_gpu.convert_layout %14 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> diff --git a/test/TritonGPU/pipeline-hopper-remove-wait.mlir b/test/TritonGPU/pipeline-hopper-remove-wait.mlir index 5b0149c32b..cd93be2c47 100644 --- a/test/TritonGPU/pipeline-hopper-remove-wait.mlir +++ b/test/TritonGPU/pipeline-hopper-remove-wait.mlir @@ -108,11 +108,11 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : %110 = tt.broadcast %109 : tensor<64x128xi64, #blocked> -> tensor<64x128xi64, #blocked> %111 = tt.addptr %101, %110 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi64, #blocked> %112 = tt.load %111 : tensor<64x128x!tt.ptr, #blocked> - %113 = triton_gpu.local_alloc %38 : (tensor<128x128xf16, #blocked>) -> !tt.memdesc<128x128xf16, #shared> - %114 = triton_gpu.local_alloc %90 : (tensor<128x64xf16, #blocked2>) -> !tt.memdesc<128x64xf16, #shared1> - %115 = triton_nvidia_gpu.warp_group_dot %113, %114, %cst :!tt.memdesc<128x128xf16, #shared> * !tt.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> + %113 = triton_gpu.local_alloc %38 : (tensor<128x128xf16, #blocked>) -> !triton_gpu.memdesc<128x128xf16, #shared> + %114 = triton_gpu.local_alloc %90 : (tensor<128x64xf16, #blocked2>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %115 = triton_nvidia_gpu.warp_group_dot %113, %114, %cst :!triton_gpu.memdesc<128x128xf16, #shared> * !triton_gpu.memdesc<128x64xf16, #shared1> -> tensor<128x64xf32, #mma> %116 = arith.truncf %115 : tensor<128x64xf32, #mma> to tensor<128x64xf16, #mma> - %117 = triton_gpu.local_alloc %112 : (tensor<64x128xf16, #blocked>) -> !tt.memdesc<64x128xf16, #shared> + %117 = triton_gpu.local_alloc %112 : (tensor<64x128xf16, #blocked>) -> !triton_gpu.memdesc<64x128xf16, #shared> %118 = triton_gpu.convert_layout %116 : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // The first dot gets converted to dot-async + wait. The second one // doesn't have a wait because the first wait is sufficient. 
@@ -121,7 +121,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: triton_nvidia_gpu.warp_group_dot // CHECK-NOT: triton_nvidia_gpu.warp_group_dot_wait // CHECK: scf.yield - %119 = triton_nvidia_gpu.warp_group_dot %118, %117, %arg23 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !tt.memdesc<64x128xf16, #shared> -> tensor<128x128xf32, #mma1> + %119 = triton_nvidia_gpu.warp_group_dot %118, %117, %arg23 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !triton_gpu.memdesc<64x128xf16, #shared> -> tensor<128x128xf32, #mma1> %120 = arith.mulf %arg24, %arg25 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> %121 = arith.addf %120, %arg25 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> %122 = arith.extsi %c0_i32 : i32 to i64 diff --git a/test/TritonGPU/prefetch.mlir b/test/TritonGPU/prefetch.mlir index 9fbc540b92..1c0eeeb666 100644 --- a/test/TritonGPU/prefetch.mlir +++ b/test/TritonGPU/prefetch.mlir @@ -48,24 +48,24 @@ tt.func @matmul_loop_mixed(%lb : index, %ub : index, %step : index, %A : !tt.ptr %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %b_ = tt.load %b_ptr_init, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { - %a_op_ = triton_gpu.local_load %a : !tt.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> + %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { + %a_op_ = triton_gpu.local_load %a : !triton_gpu.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> %a_op = tt.fp_to_fp %a_op_ : tensor<128x32xf8E5M2, #A_OP> -> tensor<128x32xf16, #A_OP> - %b_op = triton_gpu.local_load %b : !tt.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> + %b_op = triton_gpu.local_load %b : !triton_gpu.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> %c = tt.dot %a_op, %b_op, %prev_c : tensor<128x32xf16, #A_OP> * tensor<32x128xf16, #B_OP> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %next_a_ = tt.load %next_a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %next_b_ = tt.load %next_b_ptr, %b_mask, %b_other : 
tensor<32x128x!tt.ptr, #BL> - %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> + scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> } tt.return %loop#4 : tensor<128x128xf32, #C> } @@ -103,24 +103,24 @@ tt.func @matmul_loop_mixed(%lb : index, %ub : index, %step : index, %A : !tt.ptr %b_off = arith.constant dense<4> : tensor<16x128xi32, #BL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x16x!tt.ptr, #AL> - %a_init = triton_gpu.local_alloc %a_ : (tensor<128x16xf8E5M2, #AL>) -> !tt.memdesc<128x16xf8E5M2, #A> + %a_init = triton_gpu.local_alloc %a_ : (tensor<128x16xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x16xf8E5M2, #A> %b_ = tt.load %b_ptr_init, %b_mask, %b_other : tensor<16x128x!tt.ptr, #BL> - %b_init = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !tt.memdesc<16x128xf16, #B> + %b_init = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !triton_gpu.memdesc<16x128xf16, #B> - %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, #BL>, !tt.memdesc<128x16xf8E5M2, #A>, !tt.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C>) { - %a_op_ = triton_gpu.local_load %a : !tt.memdesc<128x16xf8E5M2, #A> -> tensor<128x16xf8E5M2, #A_OP> + %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x16xf8E5M2, #A>, !triton_gpu.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C>) { + %a_op_ = triton_gpu.local_load %a : !triton_gpu.memdesc<128x16xf8E5M2, #A> -> tensor<128x16xf8E5M2, #A_OP> %a_op = tt.fp_to_fp %a_op_ : tensor<128x16xf8E5M2, #A_OP> -> tensor<128x16xf16, #A_OP> - %b_op = triton_gpu.local_load %b : !tt.memdesc<16x128xf16, #B> -> tensor<16x128xf16, #B_OP> + %b_op = triton_gpu.local_load %b : !triton_gpu.memdesc<16x128xf16, #B> -> tensor<16x128xf16, #B_OP> %c = tt.dot %a_op, %b_op, %prev_c : tensor<128x16xf16, #A_OP> * tensor<16x128xf16, #B_OP> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x16x!tt.ptr, #AL>, tensor<128x16xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<16x128x!tt.ptr, #BL>, tensor<16x128xi32, #BL> %next_a_ = tt.load %next_a_ptr, %a_mask, %a_other : tensor<128x16x!tt.ptr, #AL> - %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x16xf8E5M2, #AL>) -> !tt.memdesc<128x16xf8E5M2, #A> + %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x16xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x16xf8E5M2, #A> %next_b_ = tt.load %next_b_ptr, %b_mask, %b_other : tensor<16x128x!tt.ptr, #BL> - %next_b = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !tt.memdesc<16x128xf16, #B> + %next_b = triton_gpu.local_alloc %b_ : (tensor<16x128xf16, #BL>) -> !triton_gpu.memdesc<16x128xf16, #B> - scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, 
#BL>, !tt.memdesc<128x16xf8E5M2, #A>, !tt.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C> + scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x16x!tt.ptr, #AL>, tensor<16x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x16xf8E5M2, #A>, !triton_gpu.memdesc<16x128xf16, #B>, tensor<128x128xf32, #C> } tt.return %loop#4 : tensor<128x128xf32, #C> } @@ -221,24 +221,24 @@ tt.func @matmul_loop_mixed_amd(%lb : index, %ub : index, %step : index, %A : !tt %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %a_init = triton_gpu.local_alloc %a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %b_ = tt.load %b_ptr_init, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %b_init = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { - %a_op_ = triton_gpu.local_load %a : !tt.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> + %loop:5 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C>) { + %a_op_ = triton_gpu.local_load %a : !triton_gpu.memdesc<128x32xf8E5M2, #A> -> tensor<128x32xf8E5M2, #A_OP> %a_op = tt.fp_to_fp %a_op_ : tensor<128x32xf8E5M2, #A_OP> -> tensor<128x32xf16, #A_OP> - %b_op = triton_gpu.local_load %b : !tt.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> + %b_op = triton_gpu.local_load %b : !triton_gpu.memdesc<32x128xf16, #B> -> tensor<32x128xf16, #B_OP> %c = tt.dot %a_op, %b_op, %prev_c : tensor<128x32xf16, #A_OP> * tensor<32x128xf16, #B_OP> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %next_a_ = tt.load %next_a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !tt.memdesc<128x32xf8E5M2, #A> + %next_a = triton_gpu.local_alloc %next_a_ : (tensor<128x32xf8E5M2, #AL>) -> !triton_gpu.memdesc<128x32xf8E5M2, #A> %next_b_ = tt.load %next_b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !tt.memdesc<32x128xf16, #B> + %next_b = triton_gpu.local_alloc %b_ : (tensor<32x128xf16, #BL>) -> !triton_gpu.memdesc<32x128xf16, #B> - scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !tt.memdesc<128x32xf8E5M2, #A>, !tt.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> + scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, !triton_gpu.memdesc<128x32xf8E5M2, #A>, !triton_gpu.memdesc<32x128xf16, #B>, tensor<128x128xf32, #C> } tt.return %loop#4 : 
tensor<128x128xf32, #C> } diff --git a/test/TritonGPU/reduce-data-duplication.mlir b/test/TritonGPU/reduce-data-duplication.mlir index 9fca92c9b0..67bf5bdbcc 100644 --- a/test/TritonGPU/reduce-data-duplication.mlir +++ b/test/TritonGPU/reduce-data-duplication.mlir @@ -2,7 +2,7 @@ // CHECK: #[[$SHARED:.*]] = #triton_gpu.shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1], hasLeadingOffset = false} // CHECK-LABEL: apply_swizzle -// CHECK: %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !tt.memdesc<16x256xf16, #[[$SHARED]], #triton_gpu.shared_memory> +// CHECK: %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !triton_gpu.memdesc<16x256xf16, #[[$SHARED]], #triton_gpu.shared_memory> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}> diff --git a/test/TritonGPU/reorder-instructions.mlir b/test/TritonGPU/reorder-instructions.mlir index dff1e6b60f..28f8d385cf 100644 --- a/test/TritonGPU/reorder-instructions.mlir +++ b/test/TritonGPU/reorder-instructions.mlir @@ -13,8 +13,8 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %9 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %10 = triton_gpu.local_alloc %9 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %11 = triton_gpu.local_load %10 : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %10 = triton_gpu.local_alloc %9 : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %11 = triton_gpu.local_load %10 : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %12 = tt.dot %11, %cst_0, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> tt.store %arg0, %13 : tensor<32x32x!tt.ptr, #blocked> @@ -26,20 +26,20 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // CHECK-LABEL: sink_convert_dealloc // CHECK: triton_gpu.async_wait {num = 0 : i32} -// CHECK: triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable> -// CHECK: triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable> +// CHECK: triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> +// CHECK: triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> // CHECK: %3 = triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1]}> module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { tt.func public @sink_convert_dealloc(%arg0: 
tensor<32x32xf32, #blocked>) attributes {noinline = false} { - %0 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable> - %1 = triton_gpu.local_alloc : () -> !tt.memdesc<4x128x64xf16, #shared, mutable> + %0 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> + %1 = triton_gpu.local_alloc : () -> !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> %2 = triton_gpu.convert_layout %arg0 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> triton_gpu.async_wait {num = 0 : i32} - triton_gpu.local_dealloc %0 : !tt.memdesc<4x128x64xf16, #shared, mutable> - triton_gpu.local_dealloc %1 : !tt.memdesc<4x128x64xf16, #shared, mutable> + triton_gpu.local_dealloc %0 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> + triton_gpu.local_dealloc %1 : !triton_gpu.memdesc<4x128x64xf16, #shared, mutable> %3 = arith.addf %2, %2 : tensor<32x32xf32, #blocked1> tt.return } @@ -48,8 +48,8 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // ----- // CHECK-LABEL: sink_convert_idx_1 -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> // CHECK: tt.dot #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2]}> @@ -58,12 +58,12 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war tt.func public @sink_convert_idx_1(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %B = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %BD = triton_gpu.local_load %BS : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %BD = triton_gpu.local_load %BS : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %A = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %AS = triton_gpu.local_alloc %A : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %AD = triton_gpu.local_load %AS : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %AS = triton_gpu.local_alloc %A : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %AD = triton_gpu.local_load %AS : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %12 = tt.dot %AD, %BD, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 
0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> %13 = triton_gpu.convert_layout %12 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> tt.store %arg0, %13 : tensor<32x32x!tt.ptr, #blocked> @@ -75,10 +75,10 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // check that we don't sink convert_layout if it has multi users // CHECK-LABEL: convert_cannot_sink -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> // CHECK: tt.dot -// CHECK: triton_gpu.local_load %{{.*}} : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> +// CHECK: triton_gpu.local_load %{{.*}} : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> // CHECK: tt.dot #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2]}> @@ -87,15 +87,15 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war tt.func public @convert_cannot_sink(%arg0: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %B = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %BD = triton_gpu.local_load %BS : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %BS = triton_gpu.local_alloc %B : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %BD = triton_gpu.local_load %BS : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %A0 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %AS0 = triton_gpu.local_alloc %A0 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %AD0 = triton_gpu.local_load %AS0 : !tt.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %AS0 = triton_gpu.local_alloc %A0 : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %AD0 = triton_gpu.local_load %AS0 : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %12 = tt.dot %AD0, %BD, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> %A1 = tt.load %arg0 : tensor<32x32x!tt.ptr, #blocked> - %AS1 = triton_gpu.local_alloc %A1 : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared> - %AD1 = triton_gpu.local_load %AS1 : !tt.memdesc<32x32xf32, 
#shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %AS1 = triton_gpu.local_alloc %A1 : (tensor<32x32xf32, #blocked>) -> !triton_gpu.memdesc<32x32xf32, #shared> + %AD1 = triton_gpu.local_load %AS1 : !triton_gpu.memdesc<32x32xf32, #shared> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %13 = tt.dot %AD1, %BD, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> tt.return } diff --git a/test/TritonGPU/tritongpu_ops.mlir b/test/TritonGPU/tritongpu_ops.mlir index d5c6a52e8e..3fc0585b12 100644 --- a/test/TritonGPU/tritongpu_ops.mlir +++ b/test/TritonGPU/tritongpu_ops.mlir @@ -4,8 +4,8 @@ module attributes {"triton_gpu.target" = "cuda:0", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: memdesc - // CHECK-SAME: !tt.memdesc<1x64x16xf16, #{{.+}}> - tt.func @memdesc(%d : !tt.memdesc<1x64x16xf16, #shared0>) { + // CHECK-SAME: !triton_gpu.memdesc<1x64x16xf16, #{{.+}}> + tt.func @memdesc(%d : !triton_gpu.memdesc<1x64x16xf16, #shared0>) { tt.return } } diff --git a/test/TritonNvidiaGPU/membar.mlir b/test/TritonNvidiaGPU/membar.mlir index 6d9c166508..924216222a 100644 --- a/test/TritonNvidiaGPU/membar.mlir +++ b/test/TritonNvidiaGPU/membar.mlir @@ -9,8 +9,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: init_barrier tt.func @init_barrier() { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -28,9 +28,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: inval_barrier tt.func @inval_barrier() { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.inval_barrier %alloc : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.inval_barrier %alloc : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } } @@ -48,8 +48,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: barrier_expect tt.func @barrier_expect(%pred : i1) { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, 
#shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> triton_nvidia_gpu.barrier_expect %alloc, 16384, %pred : <1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } @@ -68,8 +68,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: wait_barrier tt.func @wait_barrier(%phase : i32) { %cst = arith.constant dense<0> : tensor<1xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_nvidia_gpu.init_barrier %alloc, 1 : !tt.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<1xi64, #blocked0>) -> !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_nvidia_gpu.init_barrier %alloc, 1 : !triton_gpu.memdesc<1xi64, #shared0, #triton_gpu.shared_memory, mutable> triton_nvidia_gpu.wait_barrier %alloc, %phase : <1xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.return } @@ -89,8 +89,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: gpu.barrier // CHECK-NEXT: init_barrier %cst = arith.constant dense<0> : tensor<128x64xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %alloc : !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %alloc : !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> %l = tt.experimental_descriptor_load %arg0[%arg1, %arg1] : !tt.tensordesc> -> tensor<128x64xf16, #blocked0> tt.return %l : tensor<128x64xf16, #blocked0> } @@ -108,8 +108,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-NEXT: triton_gpu.local_alloc tt.func public @tma_store(%arg0: !tt.tensordesc>, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: tensor<128x256xf32, #blocked0>) { %cst = arith.constant dense<0> : tensor<128x64xi64, #blocked0> - %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %alloc : !tt.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + %alloc = triton_gpu.local_alloc %cst : (tensor<128x64xi64, #blocked0>) -> !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %alloc : !triton_gpu.memdesc<128x64xi64, #shared0, #triton_gpu.shared_memory, mutable> tt.experimental_descriptor_store %arg0[%arg1, %arg1], %arg2 : !tt.tensordesc>, tensor<128x256xf32, #blocked0> tt.return } diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp index d3ffaed2e8..208483beb8 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -9,6 +9,7 @@ using ::mlir::triton::gpu::AMDMfmaEncodingAttr; using 
::mlir::triton::gpu::AMDWmmaEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::SharedEncodingAttr; namespace SharedToDotOperandMFMA { diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp index 9043090802..c79df66c48 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp @@ -198,7 +198,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter, const SharedMemoryObject &smemObj, const LLVMTypeConverter *typeConverter, Value thread) { assert((opIdx == 0 || opIdx == 1) && "unexpected operand idx"); - auto aTensorTy = cast(tensor.getType()); + auto aTensorTy = cast(tensor.getType()); ArrayRef shape = aTensorTy.getShape(); auto rank = shape.size(); int kDimIdx = opIdx == 0 ? rank - 1 : rank - 2; diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp index 1ca9e49745..7f037b89b8 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp @@ -152,7 +152,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter, assert(wmmaLayout.getMNKDimPerInstr()[nonKDimIdx] == 16); auto warpsPerCTA = wmmaLayout.getWarpsPerCTA(); - auto aTensorTy = cast(tensor.getType()); + auto aTensorTy = cast(tensor.getType()); ArrayRef shape = aTensorTy.getShape(); auto sharedLayout = cast(aTensorTy.getEncoding()); auto order = sharedLayout.getOrder(); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 1a4dd8227c..2088fd8073 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -258,7 +258,7 @@ void StreamPipeliner::createStreamCopy(tt::LoadOp loadOp, Value alloc, Value mask = loadOp.getMask(); Value other = loadOp.getOther(); - tt::MemDescType allocTy = cast(alloc.getType()); + ttg::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); Operation *copy = builder.clone(*loadOp); @@ -271,7 +271,7 @@ void StreamPipeliner::createStreamCopy(tt::LoadOp loadOp, Value alloc, loadOffsets[0] = extractIdx; auto sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - auto subviewTy = tt::MemDescType::get( + auto subviewTy = ttg::MemDescType::get( allocTy.getShape().drop_front(), allocTy.getElementType(), allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto viewLoad = @@ -330,7 +330,7 @@ getSharedEncIfAllUsersAreDotEnc(Value val) { if (user->getNumResults() != 1) return std::nullopt; if (auto memDesc = - dyn_cast(user->getResult(0).getType())) { + dyn_cast(user->getResult(0).getType())) { // First time we find a shared encoding in the chain, save it and try to // use it if it is compatible with the other users. 
tempAttr = cast(memDesc.getEncoding()); @@ -340,10 +340,11 @@ getSharedEncIfAllUsersAreDotEnc(Value val) { if (!isa(user)) return std::nullopt; auto dotOpEnc = dyn_cast( - cast(user->getResult(0).getType()).getEncoding()); + cast(user->getResult(0).getType()) + .getEncoding()); if (!dotOpEnc) return std::nullopt; - auto srcTy = cast(val.getType()); + auto srcTy = cast(val.getType()); auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); auto order = ttg::getOrder(srcTy.getEncoding()); unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); @@ -669,9 +670,9 @@ Value StreamPipeliner::createAlloc(Operation *loadOp, auto ty = cast(loadOp->getResultTypes()[0]); SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); bufferShape.insert(bufferShape.begin(), numBuffers); - Type memdescType = tt::MemDescType::get(bufferShape, ty.getElementType(), - sharedEnc, sharedMemorySpace, - /*mutableMemory=*/true); + Type memdescType = ttg::MemDescType::get(bufferShape, ty.getElementType(), + sharedEnc, sharedMemorySpace, + /*mutableMemory=*/true); auto alloc = builder.create(loadOp->getLoc(), memdescType, Value()); sharedMemAllocs.push_back(alloc); diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp index 8c91eb2ebf..b9aac96cbf 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp @@ -14,6 +14,7 @@ using ::mlir::triton::gpu::getOrder; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getSizePerThread; using ::mlir::triton::gpu::getTotalElemsPerThread; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::SharedEncodingAttr; // Data loader for mma.16816 instruction. 
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp index 40cb55bbc0..29a76f6033 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp @@ -50,7 +50,7 @@ class DecomposeLocalLoadToDotOperand blockEncoding); Value load = rewriter.create(op.getLoc(), tmpType, op.getSrc()); - auto newSharedDescTy = triton::MemDescType::get( + auto newSharedDescTy = MemDescType::get( type.getShape(), type.getElementType(), triton::gpu::SharedEncodingAttr::get( op.getContext(), dstDotOp, type.getShape(), diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp index 9b1667db70..85f7da2cb5 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp @@ -30,6 +30,7 @@ using namespace mlir::triton; using ::mlir::LLVM::getSharedMemoryObjectFromStruct; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getShapePerCTATile; +using ::mlir::triton::gpu::MemDescType; using ::mlir::triton::gpu::NvidiaMmaEncodingAttr; using ::mlir::triton::gpu::SharedEncodingAttr; @@ -47,7 +48,7 @@ triton::nvgpu::WGMMAEltType getMmaRetType(Value d) { } triton::nvgpu::WGMMAEltType getMmaOperandType(Value a, bool allowTF32) { - auto aTy = cast(a.getType()).getElementType(); + auto aTy = cast(a.getType()).getElementType(); if (aTy.isF16()) { return triton::nvgpu::WGMMAEltType::f16; } else if (aTy.isBF16()) { @@ -197,7 +198,7 @@ DotOpMmaV3SmemLoader loadA(const LLVMTypeConverter *typeConverter, ConversionPatternRewriter &rewriter, Location loc, const NvidiaMmaEncodingAttr &mmaEncoding, Value tensor, Value smemObjBase, Value thread) { - auto aTy = cast(tensor.getType()); + auto aTy = cast(tensor.getType()); auto aSharedLayout = dyn_cast(aTy.getEncoding()); assert(aSharedLayout && "only support load dot operand from shared."); auto instrShape = mmaEncoding.getInstrShape(); @@ -378,8 +379,8 @@ LogicalResult convertDot(const LLVMTypeConverter *typeConverter, Value loadedC, bool allowTF32, bool needsPartialAccumulator, uint32_t maxNumImpreciseAcc, bool sync, Value thread) { - auto aTensorTy = cast(a.getType()); - auto bTensorTy = cast(b.getType()); + auto aTensorTy = cast(a.getType()); + auto bTensorTy = cast(b.getType()); auto dTensorTy = cast(d.getType()); auto aSharedLayout = dyn_cast(aTensorTy.getEncoding()); auto bSharedLayout = cast(bTensorTy.getEncoding()); From c8a31a025df9726c20c6da9c67484084d32a705b Mon Sep 17 00:00:00 2001 From: Thomas Raoux Date: Thu, 21 Nov 2024 14:55:49 -0800 Subject: [PATCH 02/12] [AMD] Prevent wrong reordering of scf operations (#5203) The pass was reordering scf.if operations without checking the extra dependencies coming from the region. For now just prevent this case although this part of the code might still be fragile. 
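
For reference, a condensed sketch of the guard this change adds to the backward-slice filter (names mirror the diff below, but the concrete op classes in the `isa<>` check are abbreviated and illustrative — see the actual change for the real list):

```
// While gathering the backward slice of an op we would like to move, note
// whether any producer is a region-holding control-flow op such as scf.if.
// The slice does not include values that are only used inside such regions,
// so reordering across that producer could break region-carried dependencies.
for (Operation *op : candidateOps) {   // candidateOps is a placeholder name
  bool dontReorder = false;
  BackwardSliceOptions options;
  options.filter = [&](Operation *defOp) {
    if (isa<scf::IfOp>(defOp)) {       // illustrative; the real check may
      dontReorder = true;              // cover more region-holding ops
      return false;
    }
    return true;                       // other producers handled as before
  };
  SetVector<Operation *> backwardSet;
  mlir::getBackwardSlice(op, &backwardSet, options);
  if (dontReorder)
    continue;                          // conservatively keep program order
  // ... existing scheduling of the op ...
}
```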
--- .../amd/amd-reorder-instructions.mlir | 37 +++++++++++++++++++ .../ReorderInstructions.cpp | 11 ++++++ 2 files changed, 48 insertions(+) diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 51353d31c3..708d75a232 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -499,3 +499,40 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war tt.return } } + + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, triton_gpu.target = "hip:gfx90a", "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: dont_hoist_scf_ops + // Make sure we don't hoist scf ops above its dependencies. + tt.func public @dont_hoist_scf_ops(%init: tensor<256x128xf32, #mfma>, + %base: tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>>, + %p1: tensor<128x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>>, %i1: i1) -> (tensor<256x128xf32, #mfma>) { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c4_i32 = arith.constant 4 : i32 + %cst = arith.constant 1.44269502 : f32 + %c128_i32 = arith.constant 128 : i32 + // CHECK: scf.for + %54 = scf.for %arg21 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg = %init) -> (tensor<256x128xf32, #mfma>) : i32 { + // CHECK: arith.addi + %f = arith.addi %arg21, %c128_i32 : i32 + // CHECK: scf.if + // CHECK: tt.load + %p0 = scf.if %i1 -> tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>>{ + %t = tt.splat %f : i32 -> tensor<256x128xi32> + %padd = tt.addptr %base, %t : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>>, tensor<256x128xi32> + scf.yield %padd : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + } else { + scf.yield %base : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + } + %l = tt.load %p0 : tensor<256x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> + %r = tt.load %p1 : tensor<128x128x!tt.ptr, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> + %acc = tt.dot %l, %r, %arg : tensor<256x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth = 4}>> * tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth = 4}>> -> tensor<256x128xf32, #mfma> + scf.yield %acc : tensor<256x128xf32, #mfma> + } + tt.return %54 : tensor<256x128xf32, #mfma> + } +} diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index f55ab78554..0837f16dcf 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -227,6 +227,7 @@ static void scheduleGlobalLoadLocalStore(triton::FuncOp funcOp) { // Gather use-def chain in block. 
Block *block = op->getBlock(); bool leadsToLoad = false; + bool dontReorder = false; SetVector backwardSet; BackwardSliceOptions options; @@ -236,6 +237,13 @@ static void scheduleGlobalLoadLocalStore(triton::FuncOp funcOp) { Block *defBlock = defOp->getBlock(); if (!block->findAncestorOpInBlock(*defOp)) return false; + // Don't hoist control flow as we don't track backtraces of ops within + // their regions. + if (isa(defOp)) { + dontReorder = true; + return false; + } + // Check for a `load` dependent path. leadsToLoad |= isa(defOp); // Only move ops residing in the same block. @@ -244,6 +252,9 @@ static void scheduleGlobalLoadLocalStore(triton::FuncOp funcOp) { mlir::getBackwardSlice(op, &backwardSet, options); backwardSet.insert(op); + // If we found ops in the slice we don't want to hoist. + if (dontReorder) + continue; // Don't move a local_store if its source is a load from // the same iteration. if (isa(op) && leadsToLoad) From 3164a4e2d5a9cb63527bd6226c6fe2405188eb66 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 21 Nov 2024 17:33:07 -0600 Subject: [PATCH 03/12] [AMD] Cover default case in MfmaGroup (#5218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If you build using the `CMakeLists.txt` and not `setup.py` and you build in `Release` then you get ``` /__w/triton/triton/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp: In function ‘std::pair mlir::TypesFromMfmaId(MLIRContext*, MfmaTypeId)’: Warning: /__w/triton/triton/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp:240:1: warning: control reaches end of non-void function [-Wreturn-type] ``` --- third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp index d3b2b70f85..9fce18e21f 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp @@ -235,8 +235,9 @@ std::pair TypesFromMfmaId(mlir::MLIRContext *ctx, return {f8e5m2fnuz, f8e4m3fnuz}; case MfmaTypeId::Bf8Bf8TyId: return {f8e5m2fnuz, f8e5m2fnuz}; + default: + llvm_unreachable("unsupported MfmaTypeId!"); } - assert(false && "unsupported MfmaTypeId"); } FailureOr MfmaInsn::selectMfma(unsigned mDim, unsigned nDim, From 82b8f0f5983fb7fd5b40b52089f8177a0ea6561a Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Thu, 21 Nov 2024 16:53:48 -0700 Subject: [PATCH 04/12] Allow Layouts to propogate to local_load (#5219) While working on some higher dimension tensor kernels, I noticed poor performance due to the fact that layouts wouldn't propagate to local loads. Since we do allow layout folding with local store and local alloc, this seems like a bit of an oversight. The change gives a 40% speed improvement on certain kernels for NVidia GPUs. This also removes asserts in lowering for higher dimensional kernels. As far as I can tell, those restrictions aren't required in practice. # New contributor declaration - [x] I am not making a trivial change, such as fixing a typo in a comment. - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - [x] I have added tests. 
- [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices) --- lib/Dialect/TritonGPU/Transforms/Utility.cpp | 3 ++- test/TritonGPU/combine.mlir | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index b8f3abfcac..5914eb2905 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -563,7 +563,8 @@ bool canFoldIntoConversion(Operation *op, Attribute targetEncoding) { } return isa(op); + triton::gpu::LocalAllocOp, triton::gpu::LocalLoadOp, + triton::gpu::LocalStoreOp>(op); } scf::ForOp replaceForOpWithNewSignature( diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir index 5e1cad52af..129eb8c101 100644 --- a/test/TritonGPU/combine.mlir +++ b/test/TritonGPU/combine.mlir @@ -2685,3 +2685,21 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war tt.return } } + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 1, 2, 2, 1], order = [4, 0, 1, 2, 3]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [1, 1, 32, 1, 1], warpsPerCTA = [1, 1, 1, 1, 4], order = [4, 3, 2, 1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 2, 2, 1, 1], order = [4, 0, 3, 2, 1]}> +#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [4, 0, 1, 2, 3], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:100", "triton_gpu.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: lift_convert_to_local_load + // CHECK-NOT: convert_layout + // CHECK: tt.return + tt.func public @lift_convert_to_local_load(%arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable>) -> tensor<2x4x32x1x4xi8, #blocked2> { + %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable> -> tensor<2x1x32x4x4xi8, #blocked> + %2 = tt.trans %1 {order = array} : tensor<2x1x32x4x4xi8, #blocked> -> tensor<2x4x32x1x4xi8, #blocked1> + %3 = triton_gpu.convert_layout %2 : tensor<2x4x32x1x4xi8, #blocked1> -> tensor<2x4x32x1x4xi8, #blocked2> + tt.return %3 : tensor<2x4x32x1x4xi8, #blocked2> + } +} From 4ae95e70cd81eb62f89ec530605440b85e799dee Mon Sep 17 00:00:00 2001 From: Thomas Raoux Date: Thu, 21 Nov 2024 19:42:10 -0800 Subject: [PATCH 05/12] [BACKEND] Fix transpose optimization missed during refactor (#5226) --- .../Transforms/OptimizeDotOperands.cpp | 4 ++-- test/TritonGPU/dot-operands.mlir | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp b/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp index b6b376101a..c776944a24 100644 --- a/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp +++ b/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp @@ -326,13 +326,13 @@ class FuseTransHopper : public OpRewritePattern { return failure(); // Match outerCvt(trans(innerCvt(x))). 
- auto trans = allocOp.getSrc().getDefiningOp(); + auto trans = allocOp.getSrc().getDefiningOp(); if (!trans || trans.getOrder() != ArrayRef({1, 0})) return failure(); MemDescType allocType = allocOp.getType(); auto allocEncoding = cast(allocType.getEncoding()); - MemDescType srcTy = trans.getSrc().getType(); + RankedTensorType srcTy = trans.getSrc().getType(); // MMAv3 with transpose only supports f16 and bf16. Fall back to MMAv3 // without transpose for other data types.) diff --git a/test/TritonGPU/dot-operands.mlir b/test/TritonGPU/dot-operands.mlir index 911cf4fb40..990a0b4f7a 100644 --- a/test/TritonGPU/dot-operands.mlir +++ b/test/TritonGPU/dot-operands.mlir @@ -256,3 +256,23 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : tt.return %r : tensor<128x64xf32, #mma> } } + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +// CHECK-LABEL: mma_reorder_transpose +// CHECK: triton_gpu.local_alloc +// CHECK: triton_gpu.memdesc_trans +// CHECK: triton_nvidia_gpu.warp_group_dot + tt.func @mma_reorder_transpose(%t: tensor<64x128xf16, #blocked1>, %dotb: !triton_gpu.memdesc<64x64xf16, #shared>, %dotc: tensor<128x64xf32, #mma>) -> tensor<128x64xf32, #mma>{ + %a = tt.trans %t {order = array} : tensor<64x128xf16, #blocked1> -> tensor<128x64xf16, #blocked> + %dota = triton_gpu.local_alloc %a: (tensor<128x64xf16, #blocked>) -> !triton_gpu.memdesc<128x64xf16, #shared1> + %r = triton_nvidia_gpu.warp_group_dot %dota, %dotb, %dotc : !triton_gpu.memdesc<128x64xf16, #shared1> * !triton_gpu.memdesc<64x64xf16, #shared> -> tensor<128x64xf32, #mma> + tt.return %r : tensor<128x64xf32, #mma> + } +} From af0649d2fcbbe8e486a5a2d397d798a476b9c22a Mon Sep 17 00:00:00 2001 From: ilia-cher <30845429+ilia-cher@users.noreply.github.com> Date: Fri, 22 Nov 2024 01:11:56 -0600 Subject: [PATCH 06/12] [AMD] Use warp shuffle for fp8 MFMA to dot operand layout conversion (#5139) Adding a shortcut case for fp8 MFMA to dot operand layout conversion that avoids using shared memory, to speed up FP8 attention kernels. --- include/triton/Analysis/Utility.h | 5 + lib/Analysis/Utility.cpp | 25 ++- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 6 + test/Conversion/amd/mfma-shortcut.mlir | 190 +++++++++++++++++- .../ConvertLayoutOpToLLVM.cpp | 154 ++++++++++++++ 5 files changed, 378 insertions(+), 2 deletions(-) diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h index df6029db0d..ae517912fb 100644 --- a/include/triton/Analysis/Utility.h +++ b/include/triton/Analysis/Utility.h @@ -218,6 +218,11 @@ bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy, RankedTensorType dstTy); +// Check if MFMA layout can be converted to the dot operand +// layout using warp shuffle. 
+bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy, + RankedTensorType dstTy); + // TODO: Move utility functions that belong to ConvertLayoutOp to class // ConvertLayoutOpHelper in the future bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 6166e10199..5fd87e4c01 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -10,6 +10,7 @@ #include "mlir/IR/Dialect.h" #include "mlir/IR/Matchers.h" #include "mlir/Support/LLVM.h" +#include "triton/Conversion/MLIRTypes.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" @@ -632,6 +633,25 @@ bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy, return ans; } +bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy, + RankedTensorType dstTy) { + auto mfmaLayout = dyn_cast(srcTy.getEncoding()); + auto dotOperandLayout = dyn_cast(dstTy.getEncoding()); + if (!mfmaLayout || !dotOperandLayout) + return false; + + // Currently supporting 32x32 and 16x16 FP8 MFMA -> dot operand case + return dotOperandLayout.getParent() == mfmaLayout && + dotOperandLayout.getOpIdx() == 0 && mfmaLayout.getIsTransposed() && + dotOperandLayout.getKWidth() == 8 && + getContigPerThread(mfmaLayout)[1] == 4 && + ((mfmaLayout.getMDim() == 16 && mfmaLayout.getNDim() == 16) || + (mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32)) && + triton::type::isFloat8(srcTy.getElementType()) && + triton::type::isFloat8(dstTy.getElementType()) && + mfmaLayout.getWarpsPerCTA()[1] == 1; +} + // We get the smallest submap of srcTy^{-1} * dstTy that is not the identity // under kBlock, kWarp or kLane (in that order). The idea here is that if we // have a transformation that's the identity on kBlock, we don't need to use @@ -730,7 +750,10 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) { // supported yet in Triton's backend. 
return !cvtReordersRegisters(srcTy, dstTy) && !isBlockedToDotShortcut(srcTy, dstTy) && - !matchMmaV3AndDotOperandLayout(srcTy, dstTy); + !matchMmaV3AndDotOperandLayout(srcTy, dstTy) && + // to be removed when generalized warp shuffle conversions + // are ready: + !matchMFMAAndDotOperandShuffleCase(srcTy, dstTy); } bool atomicNeedsSharedMemory(Value value) { diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index aab97c7dd2..f0026c1993 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -409,6 +409,12 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion return failure(); } + // The following check can be removed when generalized warp shuffle + // conversions are ready: + if (matchMFMAAndDotOperandShuffleCase(srcTy, dstTy)) { + return failure(); + } + assert(cvtNeedsSharedMemory(srcTy, dstTy)); SmallVector inVals = diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir index a2c8f48718..bcbc7eff59 100644 --- a/test/Conversion/amd/mfma-shortcut.mlir +++ b/test/Conversion/amd/mfma-shortcut.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s --decompose-unsupported-amd-conversions --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx90a" -split-input-file | FileCheck %s +// RUN: triton-opt %s --decompose-unsupported-amd-conversions --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx942" -split-input-file | FileCheck %s #mfma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}> #dotop = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=4}> @@ -27,3 +27,191 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_f8_mfma32 + tt.func public @mfma_dot_cvt_f8_mfma32(%arg0: tensor<128x32xf8E4M3FNUZ, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + + // CHECK: [[val3:%.*]] = llvm.extractvalue %arg0[3] + // CHECK: [[val7:%.*]] = llvm.extractvalue %arg0[7] + + // CHECK-DAG: [[c32:%.*]] = llvm.mlir.constant(32 : i32) + // CHECK-DAG: [[c64:%.*]] = llvm.mlir.constant(64 : i32) + + // CHECK: [[threadId:%.*]] = rocdl.workitem.id.x + // CHECK: [[laneId:%.*]] = llvm.urem [[threadId]], [[c64]] + // CHECK: [[mask0:%.*]] = llvm.icmp "slt" [[laneId]], [[c32]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c32]] + // CHECK: [[addr32:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[vec0:%.*]] = llvm.insertelement [[val3]], {{.*}} : vector<4xi8> + // CHECK: [[vec1:%.*]] = llvm.insertelement [[val7]], {{.*}} : vector<4xi8> + + // CHECK: [[bvec0:%.*]] = llvm.bitcast [[vec0]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: [[bShflVec0:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec0]] + // CHECK: [[shflVec0:%.*]] = llvm.bitcast [[bShflVec0]] + + // CHECK: [[bvec1:%.*]] = llvm.bitcast [[vec1]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: 
[[bShflVec1:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec1]] + // CHECK: [[shflVec1:%.*]] = llvm.bitcast [[bShflVec1]] + + // Input (8 values): (vec0, vec1) + // Output (8 values shuffled, '>> n' - take the value from (lane + n) % 64): + // resVec0 resVec1 + // lanes 0-31: (vec0 , vec0 >> 32) (mask0=1) + // lanes 32-63: (vec1 >> 32, vec1 ) (mask0=0) + + // CHECK: [[resVec0:%.*]] = llvm.select [[mask0]], [[vec0]], [[shflVec1]] + // CHECK: [[resVec1:%.*]] = llvm.select [[mask0]], [[shflVec0]], [[vec1]] + + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) + // CHECK: [[resVal3:%.*]] = llvm.extractelement [[resVec0]][[[c3]] : i32] : vector<4xi8> + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK: [[resVal7:%.*]] = llvm.extractelement [[resVec1]][[[c3]] : i32] : vector<4xi8> + + // CHECK: llvm.insertvalue [[resVal3]], {{.*}}[3] + // CHECK: llvm.insertvalue [[resVal7]], {{.*}}[7] + + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E4M3FNUZ, #mfma> -> tensor<128x32xf8E4M3FNUZ, #dotop0> + tt.return + } +} + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_bf8_mfma32 + tt.func public @mfma_dot_cvt_bf8_mfma32(%arg0: tensor<128x32xf8E5M2, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + // CHECK: rocdl.ds_bpermute + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E5M2, #mfma> -> tensor<128x32xf8E5M2, #dotop0> + tt.return + } +} + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_f8_mfma16 + tt.func public @mfma_dot_cvt_f8_mfma16(%arg0: tensor<128x32xf8E4M3FNUZ, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + + // CHECK: [[val3:%.*]] = llvm.extractvalue %arg0[3] + // CHECK: [[val7:%.*]] = llvm.extractvalue %arg0[7] + + // CHECK-DAG: [[c16:%.*]] = llvm.mlir.constant(16 : i32) + // CHECK-DAG: [[c32:%.*]] = llvm.mlir.constant(32 : i32) + // CHECK-DAG: [[c48:%.*]] = llvm.mlir.constant(48 : i32) + // CHECK-DAG: [[c64:%.*]] = llvm.mlir.constant(64 : i32) + + // CHECK: [[threadId:%.*]] = rocdl.workitem.id.x + // CHECK: [[laneId:%.*]] = llvm.urem [[threadId]], [[c64]] + // CHECK: [[mask0:%.*]] = llvm.icmp "slt" [[laneId]], [[c32]] + + // CHECK: [[laneIdRem:%.*]] = llvm.urem [[laneId]], [[c32]] + // CHECK: [[mask1:%.*]] = llvm.icmp "slt" [[laneIdRem]], [[c16]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c16]] + // CHECK: [[addr16:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c32]] + // CHECK: [[addr32:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[shflLaneId:%.*]] = llvm.add [[laneId]], [[c48]] + // CHECK: [[addr48:%.*]] = llvm.urem [[shflLaneId]], [[c64]] + + // CHECK: [[vec0:%.*]] = llvm.insertelement [[val3]], {{.*}} : vector<4xi8> + // CHECK: [[vec1:%.*]] = llvm.insertelement [[val7]], {{.*}} : vector<4xi8> + + // CHECK: [[bvec0:%.*]] = llvm.bitcast [[vec0]] + // CHECK: [[c2:%.*]] = 
llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr16]], [[c2]] + // CHECK: [[bShflVec0_16:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec0]] + // CHECK: [[shflVec0_16:%.*]] = llvm.bitcast [[bShflVec0_16]] + + // CHECK: [[bvec0:%.*]] = llvm.bitcast [[vec0]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: [[bShflVec0_32:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec0]] + // CHECK: [[shflVec0_32:%.*]] = llvm.bitcast [[bShflVec0_32]] + + // CHECK: [[bvec1:%.*]] = llvm.bitcast [[vec1]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr32]], [[c2]] + // CHECK: [[bShflVec1_32:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec1]] + // CHECK: [[shflVec1_32:%.*]] = llvm.bitcast [[bShflVec1_32]] + + // CHECK: [[bvec1:%.*]] = llvm.bitcast [[vec1]] + // CHECK: [[c2:%.*]] = llvm.mlir.constant(2 : i32) + // CHECK: [[addr:%.*]] = llvm.shl [[addr48]], [[c2]] + // CHECK: [[bShflVec1_48:%.*]] = rocdl.ds_bpermute [[addr]], [[bvec1]] + // CHECK: [[shflVec1_48:%.*]] = llvm.bitcast [[bShflVec1_48]] + + // Input (8 values): (vec0, vec1) + // Output (8 values shuffled, '>> n' - take the value from (lane + n) % 64): + // resVec0 resVec1 + // lanes 0-15: (vec0 , vec0 >> 16) (mask0=1, mask1=1) + // lanes 16-31: (vec0 >> 16, vec0 >> 32) (mask0=1, mask1=0) + // lanes 32-47: (vec1 >> 32, vec1 >> 48) (mask0=0, mask1=1) + // lanes 48-63: (vec1 >> 48, vec1 ) (mask0=0, mask1=0) + + // CHECK-DAG: [[mask0_true:%.*]] = llvm.select [[mask1]], [[vec0]], [[shflVec0_16]] : i1, vector<4xi8> + // CHECK-DAG: [[mask0_false:%.*]] = llvm.select [[mask1]], [[shflVec1_32]], [[shflVec1_48]] : i1, vector<4xi8> + // CHECK: [[resVec0:%.*]] = llvm.select [[mask0]], [[mask0_true]], [[mask0_false]] : i1, vector<4xi8> + + // CHECK-DAG: [[mask0_true:%.*]] = llvm.select [[mask1]], [[shflVec0_16]], [[shflVec0_32]] : i1, vector<4xi8> + // CHECK-DAG: [[mask0_false:%.*]] = llvm.select [[mask1]], [[shflVec1_48]], [[vec1]] : i1, vector<4xi8> + // CHECK: [[resVec1:%.*]] = llvm.select [[mask0]], [[mask0_true]], [[mask0_false]] : i1, vector<4xi8> + + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) + // CHECK: [[resVal3:%.*]] = llvm.extractelement [[resVec0]][[[c3]] : i32] : vector<4xi8> + // CHECK: [[c3:%.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK: [[resVal7:%.*]] = llvm.extractelement [[resVec1]][[[c3]] : i32] : vector<4xi8> + + // CHECK: llvm.insertvalue [[resVal3]], {{.*}}[3] + // CHECK: llvm.insertvalue [[resVal7]], {{.*}}[7] + + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E4M3FNUZ, #mfma> -> tensor<128x32xf8E4M3FNUZ, #dotop0> + tt.return + } +} + +// ----- + +#mfma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=8}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: mfma_dot_cvt_bf8_mfma16 + tt.func public @mfma_dot_cvt_bf8_mfma16(%arg0: tensor<128x32xf8E5M2, #mfma>) { + // CHECK-NOT: store + // CHECK-NOT: load + // CHECK: rocdl.ds_bpermute + // CHECK: llvm.return + %0 = triton_gpu.convert_layout %arg0 : tensor<128x32xf8E5M2, #mfma> -> tensor<128x32xf8E5M2, #dotop0> + tt.return + } +} diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp index 
208483beb8..3b61fb8cc4 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -116,6 +116,158 @@ struct LocalLoadOpConversion } }; +struct ConvertLayoutOpMFMAToDotOpConversion + : public ConvertOpToLLVMPattern { +public: + explicit ConvertLayoutOpMFMAToDotOpConversion( + LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo, + PatternBenefit benefit) + : ConvertOpToLLVMPattern(typeConverter, + benefit), + targetInfo(targetInfo) {} + + LogicalResult + matchAndRewrite(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto srcType = cast(op.getSrc().getType()); + auto dstType = cast(op.getType()); + + if (!matchMFMAAndDotOperandShuffleCase(srcType, dstType)) + return failure(); + + auto loc = op.getLoc(); + + SmallVector inVals = + unpackLLElements(loc, adaptor.getSrc(), rewriter); + if (inVals.empty() || inVals.size() % 8 != 0) + return failure(); + + auto mfmaLayout = dyn_cast(srcType.getEncoding()); + assert((mfmaLayout.getMDim() == 16 || mfmaLayout.getMDim() == 32) && + "Expected MFMA size 16 or 32"); + assert(triton::gpu::getWarpSize(mfmaLayout) == 64 && + "Expected warp size 64 for MFMA"); + + auto elemTy = int_ty(8); + auto vecTy = vec_ty(elemTy, 4); + + Value c16 = i32_val(16); + Value c32 = i32_val(32); + Value c48 = i32_val(48); + Value c64 = i32_val(64); + + Value threadId = tid_val(); + Value laneId = urem(threadId, c64); + + Value mask0 = icmp_slt(laneId, c32); + Value mask1 = icmp_slt(urem(laneId, c32), c16); + + Value addrShift16 = urem(add(laneId, c16), c64); + Value addrShift32 = urem(add(laneId, c32), c64); + Value addrShift48 = urem(add(laneId, c48), c64); + + SmallVector outVals; + for (size_t startIdx = 0; startIdx < inVals.size(); startIdx += 8) { + Value vec0 = undef(vecTy); + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + vec0 = + insert_element(vecTy, vec0, inVals[startIdx + vIdx], i32_val(vIdx)); + } + Value vec1 = undef(vecTy); + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + vec1 = insert_element(vecTy, vec1, inVals[startIdx + vIdx + 4], + i32_val(vIdx)); + } + + Value resVec0, resVec1; + if (mfmaLayout.getMDim() == 32) { + /* + Using wave shuffle to convert layouts (32x32x16 case): + 1) Input MMA layout (32x32, fp8, 16 values): + _____________________________________________________________ + |(t0 v0 v1 v2 v3) (t32 v0 v1 v2 v3) ... (t32 v12 v13 v14 v15)| + | ... ... | + |(t31 v0 v1 v2 v3) (t63 v0 v1 v2 v3) ... (t63 v12 v13 v14 v15)| + |_____________________________________________________________| + + 2) Output Dot operand layout (two 32x16 tiles, fp8, 8 values each): + ____________________________________________________________ ___ + |(t0 v0 v1 v2 v3 v4 v5 v6 v7) (t32 v0 v1 v2 v3 v4 v5 v6 v7) || + | ... ... ||... 
+ |(t31 v0 v1 v2 v3 v4 v5 v6 v7) (t63 v0 v1 v2 v3 v4 v5 v6 v7) || + |____________________________________________________________||___ + */ + + Value shflVec0 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec0, int_ty(32)), addrShift32), + vecTy); + Value shflVec1 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec1, int_ty(32)), addrShift32), + vecTy); + + resVec0 = select(mask0, vec0, shflVec1); + resVec1 = select(mask0, shflVec0, vec1); + } else if (mfmaLayout.getMDim() == 16) { + /* + 16x16x32 case: + 1) Input MMA layout (two 16x16, fp8, 4 values each): + _________________________________________________________ ___________ + |(t0 v0 v1 v2 v3) (t16 v0 v1 v2 v3) ... (t48 v0 v1 v2 v3)||(t0 v4 ... + | ... ... || ... + |(t15 v0 v1 v2 v3) (t31 v0 v1 v2 v3) ... (t63 v0 v1 v2 v3)||(t15 v4 ... + |_________________________________________________________||___________ + + 2) Output Dot operand layout (16x32 tile, fp8, 8 values): + ________________________________________________________________ + |(t0 v0 v1 v2 v3 v4 v5 v6 v7) ... (t48 v0 v1 v2 v3 v4 v5 v6 v7) | + | ... ... | + |(t15 v0 v1 v2 v3 v4 v5 v6 v7) ... (t63 v0 v1 v2 v3 v4 v5 v6 v7) | + |________________________________________________________________| + */ + + Value shflVec0_16 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec0, int_ty(32)), addrShift16), + vecTy); + Value shflVec0_32 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec0, int_ty(32)), addrShift32), + vecTy); + Value shflVec1_32 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec1, int_ty(32)), addrShift32), + vecTy); + Value shflVec1_48 = + bitcast(targetInfo.shuffleIdx( + rewriter, loc, bitcast(vec1, int_ty(32)), addrShift48), + vecTy); + + resVec0 = select(mask0, select(mask1, vec0, shflVec0_16), + select(mask1, shflVec1_32, shflVec1_48)); + resVec1 = select(mask0, select(mask1, shflVec0_16, shflVec0_32), + select(mask1, shflVec1_48, vec1)); + } + + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + outVals.push_back(extract_element(elemTy, resVec0, i32_val(vIdx))); + } + for (size_t vIdx = 0; vIdx < 4; ++vIdx) { + outVals.push_back(extract_element(elemTy, resVec1, i32_val(vIdx))); + } + } + + Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter, + op.getType()); + rewriter.replaceOp(op, result); + return success(); + } + +protected: + const TargetInfoBase &targetInfo; +}; + } // namespace namespace mlir::triton::AMD { @@ -124,5 +276,7 @@ void populateConvertLayoutOpToLLVMPatterns( RewritePatternSet &patterns, int numWarps, ModuleAxisInfoAnalysis &axisInfoAnalysis, PatternBenefit benefit) { patterns.add(typeConverter, benefit); + patterns.add(typeConverter, targetInfo, + benefit); } } // namespace mlir::triton::AMD From 433037206d8870f0b82a3cd669097001084a29ed Mon Sep 17 00:00:00 2001 From: Mario Lezcano Casado <3291265+lezcano@users.noreply.github.com> Date: Fri, 22 Nov 2024 11:44:34 +0000 Subject: [PATCH 07/12] [LAYOUTS] [BE] Simplify Ampere/Hopper paths introduced in #5189 (#5200) We simplify the implementation of `getElemsPerThread` and strengthen the preconditions of `getRepForOperand`. More generally, we should try to minimise the calls to `isAmpere` and `isHopper` throughout the codebase. I'll do a pass fixing many of these once we land LLs for `ldmatrix` and Hopper. 
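
For a concrete reading of the strengthened precondition (the assert text is taken from the change below; the surrounding helper is an illustration only):

```
#include <cassert>

// 32 / bitwidth is the number of elements that fit in one 32-bit register, so
// bf16/f16 operands (bitwidth = 16) need kWidth >= 2 and fp8 operands
// (bitwidth = 8) need kWidth >= 4; callers are now expected to guarantee this.
static void checkRepForOperandPrecondition(int bitwidth, int kWidth) {
  assert(kWidth >= 32 / bitwidth &&
         "kWidth must be >= 32 / bitwidth for this function to be well-defined");
}
```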
--- lib/Dialect/TritonGPU/IR/Dialect.cpp | 79 ++++++++++++++-------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 3273154fc7..2b1657652e 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1038,23 +1038,18 @@ DotOperandEncodingAttr::getElemsPerThread(ArrayRef shape, elemsPerThread[rank - 1] = (idx == 0) ? rep[2] * kWidth : rep[2]; return elemsPerThread; } else if (auto mma = mlir::dyn_cast(parent)) { - if (mma.isAmpere() || mma.isHopper()) { - auto bitwidth = getPointeeType(eltTy).getIntOrFloatBitWidth(); - auto rep = mma.getRepForOperand(shape, bitwidth, kWidth, idx); - auto sizePerThread = getSizePerThread(); - auto elemsPerKRep = mma.isHopper() ? (kWidth * 2) : (32 / bitwidth * 2); - if (rank == 3) - elemsPerThread[0] = rep[0]; - elemsPerThread[rank - 2] = - (idx == 0) - ? rep[1] * sizePerThread[rank - 2] - : std::max(rep[1] * elemsPerKRep, sizePerThread[rank - 2]); - elemsPerThread[rank - 1] = - (idx == 0) - ? std::max(rep[2] * elemsPerKRep, sizePerThread[rank - 1]) - : rep[2] * sizePerThread[rank - 1]; - return elemsPerThread; + assert(getCTALayout(*this) == + CTALayoutAttr::getDefault(getContext(), rank) && + "NYI"); + auto sizePerThread = getSizePerThread(); + auto threadsPerWarp = getThreadsPerWarp(); + auto warpsPerCTA = getWarpsPerCTA(); + SmallVector regs; + for (auto [n, nsize, nThread, nWarp] : + llvm::zip(shape, sizePerThread, threadsPerWarp, warpsPerCTA)) { + regs.push_back(std::max(nsize, n / (nThread * nWarp))); } + return regs; } llvm_unreachable("getElemsPerThread is not supported for dot operand"); @@ -2341,35 +2336,41 @@ NvidiaMmaEncodingAttr::getRepOrderForOperand(int opIdx) const { SmallVector NvidiaMmaEncodingAttr::getRepForOperand(ArrayRef shape, int bitwidth, int kWidth, int opIdx) const { + assert( + kWidth >= 32 / bitwidth && + "kWidth must be >= 32 / bitwidth for this function to be well-defined"); auto rank = shape.size(); + // Broadcast long K auto warpsPerCTA = getWarpsPerCTA(); + auto kDim = opIdx == 0 ? rank - 1 : rank - 2; + warpsPerCTA[kDim] = 1; - // {batch, m, n, k} - // Hopper path never uses the n value, since this method is only invoked - // for in-RF (dotOpEnc) operands, but WGMMA only supports in A to be in RF - // TODO: rep per operand is not accurate for Hopper. It is currently done that - // way to allow us to get the correct total number of elements. this will be - // fixed when moving to linear layout. - SmallVector shapePerWarp = { - 1, 16, 8, isHopper() ? 4 * 2 * kWidth : 4 * 64 / bitwidth}; - int numRepBatch = - rank == 3 - ? 
std::max(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0])) - : 1; - + SmallVector tileSize; + if (rank == 3) { + tileSize.push_back(1); + } if (opIdx == 0) { - return {numRepBatch, - std::max(1, /*repM=*/shape[rank - 2] / - (shapePerWarp[1] * warpsPerCTA[rank - 2])), - std::max(1, /*repK=*/shape[rank - 1] / shapePerWarp[3])}; + // m x k + tileSize.push_back(16); + tileSize.push_back(4 * 64 / bitwidth); } else { - assert(opIdx == 1); - return { - numRepBatch, - std::max(1, /*repK=*/shape[rank - 2] / shapePerWarp[3]), - std::max(1, /*repN=*/shape[rank - 1] / - (shapePerWarp[2] * warpsPerCTA[rank - 1]))}; + // k x n + // Hopper path never uses the n value, since this method is only invoked + // for in-RF (dotOpEnc) operands, but WGMMA only supports in A to be in RF + // so it's fine if the n is incorrect here + tileSize.push_back(4 * 64 / bitwidth); + tileSize.push_back(8); + } + + SmallVector numRep; + // Lezcano: This is odd. Why do we always return a vector of size 3? + if (rank != 3) { + numRep.push_back(1); + } + for (auto [s, size, warp] : llvm::zip(shape, tileSize, warpsPerCTA)) { + numRep.push_back(std::max(1, s / (size * warp))); } + return numRep; } SmallVector From e5588383b9efc5bea8b0e8e42a8a9720c3c724c8 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Fri, 22 Nov 2024 07:26:31 -0800 Subject: [PATCH 08/12] [BACKEND] Use LL to simplify redundant elements check and fix related issues (#5225) --- python/test/unit/language/test_core.py | 21 +-- .../LoadStoreOpToLLVM.cpp | 153 ++++++++---------- 2 files changed, 70 insertions(+), 104 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 23c598ee16..514ac171a3 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -5436,21 +5436,11 @@ def test_convertmma2mma(M, N, mma_pair, dtype, device, tmp_path: pathlib.Path): pytest.skip("Skip testing MMAv3 on devices with CC < 9") num_warps = np.cumprod(src_layout.warps_per_cta)[-1] - # TODO(Keren): Remove the intermediate layout once we have resolved the redundantDataMask issue for WGMMA - warps_per_cta = src_layout.warps_per_cta - interm = BlockedLayout([1, 4], [4, THREADS_PER_WARP // 4], [warps_per_cta[0], warps_per_cta[1]], [0, 1], [1, 1], - [1, 1], [0, 1]) def do_test(src_layout, dst_layout): layouts = f""" #src = {src_layout} #dst = {dst_layout} - #interm = {interm} - """ - - conversion = f""" - %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> - %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> """ ir = layouts + f""" @@ -5460,6 +5450,7 @@ def do_test(src_layout, dst_layout): %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> %1 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>> %2 = tt.splat %arg0 : !tt.ptr -> tensor<{M}x{N}x!tt.ptr, #src> + %3 = tt.splat %arg1 : !tt.ptr -> tensor<{M}x{N}x!tt.ptr, #dst> %4 = tt.expand_dims %0 {{axis = 1 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> -> tensor<{M}x1xi32, #src> %5 = arith.muli %4, %cst : tensor<{M}x1xi32, #src> %6 = tt.expand_dims %1 {{axis = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>> -> tensor<1x{N}xi32, #src> @@ -5468,12 +5459,10 @@ def do_test(src_layout, dst_layout): %9 = arith.addi %8, %7 : tensor<{M}x{N}xi32, #src> %10 = tt.addptr %2, %9 : 
tensor<{M}x{N}x!tt.ptr, #src>, tensor<{M}x{N}xi32, #src> %11 = tt.load %10 : tensor<{M}x{N}x!tt.ptr, #src> - %3 = tt.splat %arg1 : !tt.ptr -> tensor<{M}x{N}x!tt.ptr, #interm> - """ + conversion + f""" - %15 = triton_gpu.convert_layout %12 : tensor<{M}x{N}xi32, #dst> -> tensor<{M}x{N}xi32, #interm> - %16 = triton_gpu.convert_layout %13 : tensor<{M}x{N}xf16, #dst> -> tensor<{M}x{N}xf16, #interm> - %17 = tt.addptr %3, %15 : tensor<{M}x{N}x!tt.ptr, #interm>, tensor<{M}x{N}xi32, #interm> - tt.store %17, %16 : tensor<{M}x{N}x!tt.ptr, #interm> + %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst> + %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst> + %14 = tt.addptr %3, %12 : tensor<{M}x{N}x!tt.ptr, #dst>, tensor<{M}x{N}xi32, #dst> + tt.store %14, %13 : tensor<{M}x{N}x!tt.ptr, #dst> tt.return }} }} diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp index cc52507121..d2cef405eb 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -8,6 +8,7 @@ #include "Utility.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include "triton/Dialect/Triton/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" using namespace mlir; @@ -24,87 +25,57 @@ using ::mlir::triton::gpu::SharedEncodingAttr; namespace { // Return the mask for the unique data accessed by given tensor type. -// Used to mask out the redundant data accessed by threads. -Value redundantDataMask(Type valueTy, ConversionPatternRewriter &rewriter, - Location loc, const NVIDIA::TargetInfo &targetInfo) { +// NOTE: Redundant memory load is allowed in triton, but redundant memory store +// is not allowed. 
+// mask = true: thread can write +// mask = false: thread should not write +Value getRedundantDataMask(ModuleOp moduleOp, Type valueTy, + ConversionPatternRewriter &rewriter, Location loc, + int regIdx, const NVIDIA::TargetInfo &targetInfo) { + auto ctx = moduleOp.getContext(); auto tensorTy = dyn_cast(valueTy); - Value mask = int_val(1, 1); + auto numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(moduleOp); auto tid = tid_val(); - auto clusterCTAId = targetInfo.getClusterCTAId(rewriter, loc); + auto mask = true_val(); + auto kReg = str_attr("register"); + auto kLane = str_attr("lane"); + auto kWarp = str_attr("warp"); + auto kBlock = str_attr("block"); if (tensorTy) { - auto layout = tensorTy.getEncoding(); auto shape = tensorTy.getShape(); - unsigned rank = shape.size(); - auto sizePerThread = triton::gpu::getSizePerThread(layout); - auto threadsPerWarp = triton::gpu::getThreadsPerWarp(layout); - auto warpsPerCTA = triton::gpu::getWarpsPerCTA(layout); - auto threadOrder = triton::gpu::getThreadOrder(layout); - SmallVector warpOrder(rank); - if (auto enc = dyn_cast(layout)) { - warpOrder = - triton::gpu::getMatrixOrder(rank, /*rowMajor=*/enc.getOpIdx() == 1); + auto layout = tensorTy.getEncoding(); + auto ll = triton::gpu::toLinearLayout(shape, layout); + assert(ll.has_value() && "Failed to convert layout to linear layout"); + auto freeVariableMasks = ll->getFreeVariableMasks(); + auto regMasks = freeVariableMasks[kReg]; + if (regMasks & regIdx) { + // Step 1: check register redundancy + mask = false_val(); } else { - warpOrder = triton::gpu::getWarpOrder(layout); - } - auto shapePerCTATile = triton::gpu::getShapePerCTATile(layout); - Value warpSize = i32_val(32); - Value laneId = urem(tid, warpSize); - Value warpId = udiv(tid, warpSize); - // TODO: [DOT LL] - // The delinearize function is not entirely correct for certain layouts, - // such as wgmma. The correct approach is to convert a legacy layout to its - // corresponding linear layout and use the linear layout's - // getFreeVariableMasks to identify redundant elements. - SmallVector multiDimWarpId = - delinearize(rewriter, loc, warpId, warpsPerCTA, warpOrder); - SmallVector multiDimThreadId = - delinearize(rewriter, loc, laneId, threadsPerWarp, threadOrder); - for (unsigned dim = 0; dim < rank; ++dim) { - // if there is no data replication across threads on this dimension - if (shape[dim] >= shapePerCTATile[dim]) - continue; - // Otherwise, we need to mask threads that will replicate data on this - // dimension. 
Calculate the thread index on this dimension for the CTA - Value threadDim = - add(mul(multiDimWarpId[dim], i32_val(threadsPerWarp[dim])), - multiDimThreadId[dim]); - mask = and_(mask, icmp_slt(mul(threadDim, i32_val(sizePerThread[dim])), - i32_val(shape[dim]))); - } - // Do not write duplicated data when multicast is enabled - if (triton::gpu::getNumCTAs(layout) > 1) { - auto _0 = i32_val(0); - auto CTAsPerCGA = triton::gpu::getCTAsPerCGA(layout); - auto CTASplitNum = triton::gpu::getCTASplitNum(layout); - auto CTAOrder = triton::gpu::getCTAOrder(layout); - - auto multiDimClusterCTAId = - delinearize(rewriter, loc, clusterCTAId, CTAsPerCGA, CTAOrder); - - for (unsigned dim = 0; dim < rank; ++dim) { - // Skip when multicast is not enabled in this dimension - if (CTAsPerCGA[dim] == CTASplitNum[dim]) - continue; - // This wrapping rule must be consistent with emitCTAOffsetForLayout - unsigned splitNum = std::min(shape[dim], CTASplitNum[dim]); - Value repId = udiv(multiDimClusterCTAId[dim], i32_val(splitNum)); - // Consider the example where CTAsPerCGA = [4] and CTASplitNum = [2]: - // CTA0 and CTA2 holds data of block0, - // CTA1 and CTA3 holds data of block1. - // Only CTA0 and CTA1 are expected to write while CTA2 and CTA3 should - // be masked. We add the following mask: - // multiDimClusterCTAId[dim] / splitNum == 0 - // Actually in all existing cases of multicast, splitNum is always 1. - // The mask is equivalent to: - // multiDimClusterCTAId[dim] == 0 - mask = and_(mask, icmp_eq(repId, _0)); + Value warpSize = + i32_val(triton::gpu::TritonGPUDialect::getThreadsPerWarp(moduleOp)); + Value laneId = urem(tid, warpSize); + Value warpId = udiv(tid, warpSize); + // Step 2: check lane and warp redundancy + auto laneMasks = freeVariableMasks[kLane]; + auto warpMasks = freeVariableMasks[kWarp]; + mask = and_(mask, icmp_eq(and_(i32_val(laneMasks), laneId), i32_val(0))); + mask = and_(mask, icmp_eq(and_(i32_val(warpMasks), warpId), i32_val(0))); + if (numCTAs > 1) { + // Step 3: check block redundancy + auto ctaId = targetInfo.getClusterCTAId(rewriter, loc); + auto ctaMasks = freeVariableMasks[kBlock]; + mask = and_(mask, icmp_eq(and_(i32_val(ctaMasks), ctaId), i32_val(0))); } } } else { - // If the tensor is not ranked, then it is a scalar and only thread 0 of - // CTA0 can write - mask = and_(mask, icmp_eq(clusterCTAId, i32_val(0))); mask = and_(mask, icmp_eq(tid, i32_val(0))); + if (numCTAs > 1) { + auto ctaId = targetInfo.getClusterCTAId(rewriter, loc); + // If the tensor is not ranked, then it is a scalar and only thread 0 of + // CTA0 within the cluster can write + mask = and_(mask, icmp_eq(ctaId, i32_val(0))); + } } return mask; } @@ -264,7 +235,7 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern, PTXBuilder ptxBuilder; - Value pred = mask ? maskElems[vecStart] : int_val(1, 1); + Value pred = mask ? maskElems[vecStart] : true_val(); const std::string readConstraint = (width == 64) ? "l" : ((width == 32) ? 
"r" : "c"); @@ -437,7 +408,7 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern, << mask << "\n"; } - Value mask = redundantDataMask(valueTy, rewriter, loc, targetInfo); + auto moduleOp = op->getParentOfType(); const size_t dtsize = std::max(1, valueElemTy.getIntOrFloatBitWidth() / 8); const size_t valueElemNBits = dtsize * 8; @@ -485,6 +456,8 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern, PTXBuilder ptxBuilder; auto *asmArgList = ptxBuilder.newListOperand(asmArgs); + Value mask = getRedundantDataMask(moduleOp, valueTy, rewriter, loc, + vecStart, targetInfo); Value maskVal = llMask ? and_(mask, maskElems[vecStart]) : mask; auto *asmAddr = @@ -577,7 +550,6 @@ struct AtomicCASOpConversion << " origin vec = " << vecOrig << " elemsPerThread = " << elemsPerThread << "\n"; - Value mask = redundantDataMask(valueTy, rewriter, loc, targetInfo); auto vecTy = vec_ty(valueElemTy, vec); SmallVector resultVals(elemsPerThread); @@ -607,6 +579,8 @@ struct AtomicCASOpConversion os << op.getSem(); auto scope = stringifyMemSyncScope(op.getScope()).str(); atom.global().o(semStr).o(scope).o("cas").o(sTy); + Value mask = + getRedundantDataMask(moduleOp, valueTy, rewriter, loc, i, targetInfo); atom(dstOpr, ptrOpr, cmpOpr, valOpr).predicate(mask); if (tensorTy) { @@ -736,12 +710,12 @@ struct AtomicRMWOpConversion << " packed = " << packed << " origin vec = " << vecOrig << " numElems = " << numElems; - Value mask = redundantDataMask(valueTy, rewriter, loc, targetInfo); - auto packedTy = vec_ty(valueElemTy, packed); SmallVector resultVals(elemsPerThread); for (size_t i = 0; i < elemsPerThread; i += vec * packed) { Value rmwPtr = ptrElements[i]; + Value mask = + getRedundantDataMask(moduleOp, valueTy, rewriter, loc, i, targetInfo); Value rmwMask = llMask ? and_(mask, maskElements[i]) : mask; std::string sTy; PTXBuilder ptxBuilderAtomicRMW; @@ -976,6 +950,7 @@ struct AsyncCopyGlobalToLocalOpConversion << vecBytes << " bytes"; } + auto moduleOp = op->getParentOfType(); for (int i = 0; i < shmemAddrs.size(); i++) { // It's possible that vecTy is larger than 128 bits, in which case we have // to use multiple cp.async instructions. @@ -1003,24 +978,26 @@ struct AsyncCopyGlobalToLocalOpConversion // if there's any mask. cp.async will automatically fill the // remaining slots with 0 if cp-size > src-size. // XXX(Keren): Always assume other = 0 for now. + // When 'other != 0' is supported, we will need to fold the + // op.getMask() and redundantDataMask() into the same predicate, the + // way it is done for LoadOp. auto selectOp = select(maskElems[elemIdx], i32_val(wordBytes), i32_val(0)); srcSize = ptxBuilder.newOperand(selectOp, "r"); } - // When 'other != 0' is supported, we will need to fold the op.getMask() - // and redundantDataMask() into the same predicate, the way it is done - // for LoadOp. - Value maskVal = redundantDataMask(srcTy, rewriter, loc, targetInfo); - - // TODO: Masking does not work for CTA multicast with cp.async. This is - // a quick and dirty workaround to avoid the issue. bool skipMaskForMultiCTA = triton::gpu::getNumCTAs(srcLayout) > 1; - if (!skipMaskForMultiCTA) { - copyAsyncOp(dstOperand, srcOperand, copySize, srcSize) - .predicate(maskVal); - } else { + if (skipMaskForMultiCTA) { + // TODO: Masking does not work for CTA multicast with cp.async. + // XXX(@peterbell10): In the multi-CTA mode, the redundant data might + // be on different CTAs which don't share the same smem address space, + // so we might need to load the same data multiple times. 
copyAsyncOp(dstOperand, srcOperand, copySize, srcSize); + } else { + Value mask = getRedundantDataMask(moduleOp, srcTy, rewriter, loc, + elemIdx, targetInfo); + copyAsyncOp(dstOperand, srcOperand, copySize, srcSize) + .predicate(mask); } ptxBuilder.launch(rewriter, loc, void_ty(getContext())); } From bede39f56ff3f623f818849fde6f99ceb706eb83 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Fri, 22 Nov 2024 10:34:14 -0500 Subject: [PATCH 09/12] Make TMA tests compatible with older CUDA toolchains (#5221) TMA fences require CUDA toolchain 12.3 or greater, but current gating does not check the CUDA toolchain version. This causes `test_experimental_tma.py` to fail when run with older CUDA toolchains. ## Before With cuda-12.0: ``` 55 failed, 9 passed in 18.11s ``` With cuda-12.4: ``` 64 passed in 11.99s ``` ## After With cuda-12.0: ``` 9 passed, 55 skipped in 4.26s ``` With cuda-12.4: ``` 64 passed in 11.96s ``` --- .../test/unit/hopper/test_experimental_tma.py | 10 +++++++--- python/triton/_internal_testing.py | 18 +++++++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/test/unit/hopper/test_experimental_tma.py b/python/test/unit/hopper/test_experimental_tma.py index 7062093aef..23065953d6 100644 --- a/python/test/unit/hopper/test_experimental_tma.py +++ b/python/test/unit/hopper/test_experimental_tma.py @@ -4,7 +4,7 @@ import triton import triton.language as tl from triton.tools.experimental_descriptor import (create_1d_tma_descriptor, create_2d_tma_descriptor) -from triton._internal_testing import dtypes_with_bfloat16, numpy_random, to_triton, requires_tma +from triton._internal_testing import dtypes_with_bfloat16, numpy_random, to_triton, requires_tma, supports_tma, tma_skip_msg from typing import Optional @@ -29,9 +29,11 @@ def unwrap_tensor(t: torch.Tensor | triton.runtime.jit.TensorWrapper): tma_dtypes = sorted(set(dtypes_with_bfloat16) - {"int64", "uint64", "float64"}) -@requires_tma @pytest.mark.parametrize("byval_tma", [True, False]) def test_experimetal_descriptor_load(byval_tma): + if not supports_tma(byval_tma): + pytest.skip(tma_skip_msg(byval_tma)) + device = "cuda" SIZE = 128 @@ -82,11 +84,13 @@ def matmul_kernel_tma(a_desc_ptr, b_desc_ptr, c_desc_ptr, # tl._experimental_descriptor_store(c_desc_ptr, accumulator, [offs_am, offs_bn]) -@requires_tma @pytest.mark.parametrize("num_stages", [1, 4]) @pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(32, 32, 32), (128, 64, 64), (128, 128, 64), (128, 256, 64)]) @pytest.mark.parametrize("byval_tma", [True, False]) def test_experimental_tma_matmul(num_stages, BLOCK_M, BLOCK_N, BLOCK_K, byval_tma): + if not supports_tma(byval_tma): + pytest.skip(tma_skip_msg(byval_tma)) + device = "cuda" M, N, K = 8192, 8192, 1024 torch.manual_seed(42) diff --git a/python/triton/_internal_testing.py b/python/triton/_internal_testing.py index fa5df4f865..5ba0be1e34 100644 --- a/python/triton/_internal_testing.py +++ b/python/triton/_internal_testing.py @@ -4,6 +4,7 @@ import torch import triton import triton.language as tl +from triton.backends.nvidia.compiler import _path_to_binary import pytest from numpy.random import RandomState @@ -140,8 +141,19 @@ def to_numpy(x): raise ValueError(f"Not a triton-compatible tensor: {x}") -def supports_tma(): - return is_cuda() and torch.cuda.get_device_capability()[0] >= 9 +def supports_tma(byval_only=False): + _, cuda_version = _path_to_binary("ptxas") + min_cuda_version = (12, 0) if byval_only else (12, 3) + cuda_version_tuple = tuple(map(int, cuda_version.split("."))) + 
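    # Assumption: _path_to_binary reports the ptxas version as a "major.minor" string. +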
assert len(cuda_version_tuple) == 2, cuda_version_tuple + return is_cuda() and torch.cuda.get_device_capability()[0] >= 9 and cuda_version_tuple >= min_cuda_version + + +def tma_skip_msg(byval_only=False): + if byval_only: + return "Requires __grid_constant__ TMA support (NVIDIA Hopper or higher, CUDA 12.0 or higher)" + else: + return "Requires advanced TMA support (NVIDIA Hopper or higher, CUDA 12.3 or higher)" -requires_tma = pytest.mark.skipif(not supports_tma(), reason="Requires TMA support (NVIDIA Hopper or higher)") +requires_tma = pytest.mark.skipif(not supports_tma(), reason=tma_skip_msg()) From 03c63129d314bc2d112a802115e39fce3f17d2ec Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 22 Nov 2024 09:42:54 -0600 Subject: [PATCH 10/12] [CMake] Add C as project language (#5217) If you build with `-DTRITON_BUILD_UT=OFF` on Mac, you will get something like ``` -- Looking for histedit.h CMake Error at /opt/homebrew/Cellar/cmake/3.30.5/share/cmake/Modules/CheckIncludeFile.cmake:90 (try_compile): Unknown extension ".c" for file -- Looking for histedit.h - not found /Users/runner/work/triton/triton/triton-build/CMakeFiles/CMakeScratch/TryCompile-QA06d6/CheckIncludeFile.c try_compile() works only for enabled languages. Currently these are: CXX See project() command to enable other languages. Call Stack (most recent call first): llvm-bd9145c8-macos-arm64/lib/cmake/llvm/FindLibEdit.cmake:28 (check_include_file) llvm-bd9145c8-macos-arm64/lib/cmake/llvm/LLVMConfig.cmake:177 (find_package) llvm-bd9145c8-macos-arm64/lib/cmake/mlir/MLIRConfig.cmake:10 (find_package) ``` because `C` isn't an enabled project language. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5aa40499e..a73f3ad716 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_INCLUDE_CURRENT_DIR ON) -project(triton CXX) +project(triton CXX C) include(CTest) if(NOT WIN32) From 422e5d349592e06cdc50d2c5f34f65371138a31c Mon Sep 17 00:00:00 2001 From: Alexander Efimov Date: Fri, 22 Nov 2024 20:56:28 +0300 Subject: [PATCH 11/12] [AMD] Fix slow compilation due to inlining print calls (#5153) This PR disables inlining of print-related functions, which speeds up compilation of test_scan_layouts dramatically. --------- Co-authored-by: Lei Zhang --- python/src/llvm.cc | 2 -- python/test/unit/language/test_core.py | 2 -- third_party/amd/backend/compiler.py | 3 +++ third_party/amd/python/triton_amd.cc | 18 ++++++++++++++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/src/llvm.cc b/python/src/llvm.cc index f9b98a2540..182f79d783 100644 --- a/python/src/llvm.cc +++ b/python/src/llvm.cc @@ -139,8 +139,6 @@ std::string translateLLVMIRToASM(llvm::Module &module, { llvm::raw_string_ostream stream(result); llvm::buffer_ostream pstream(stream); - for (llvm::Function &f : module.functions()) - f.addFnAttr(llvm::Attribute::AlwaysInline); llvm::legacy::PassManager pass; // emit auto fileType = isObject ? llvm::CodeGenFileType::ObjectFile diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 514ac171a3..e62373d6fb 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -2563,8 +2563,6 @@ def kernel(X, Y, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, NUM_PID_N: tl.
@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("add_overflow_check", [False, True]) def test_scan_layouts(M, N, src_layout, axis, add_overflow_check, device, tmp_path: pathlib.Path): - if add_overflow_check is True and is_hip(): - pytest.skip("overflow check disabled on HIP while fixing issues") overflow_check = """ %17 = arith.extsi %arg2 : i32 to i64 diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index c8c43a0512..c222be2cd6 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -342,6 +342,9 @@ def make_llir(src, metadata, options): metadata["shared"] = src.get_int_attr("triton_gpu.shared") amd.cleanup_bitcode_metadata(llvm_mod) + # Disable inlining of print related functions, + # because inlining of these function could slow down compilation significantly + amd.disable_print_inline(llvm_mod) return str(llvm_mod) @staticmethod diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc index a9bd3e9b7f..3c33509910 100644 --- a/third_party/amd/python/triton_amd.cc +++ b/third_party/amd/python/triton_amd.cc @@ -161,6 +161,24 @@ void init_triton_amd(py::module &&m) { module->eraseNamedMetadata(openclVersion); }); + m.def("disable_print_inline", [](llvm::Module *module) { + // List of functions name prefixes we want to forbid inline. + std::array prefixes = {"__ockl_fprintf", "__ockl_printf"}; + + for (llvm::Function &f : module->functions()) { + if (!f.hasName()) + continue; + llvm::StringRef name = f.getName(); + + auto isNamePrefixed = [&name](const char *prefix) { + return name.starts_with(prefix); + }; + + if (llvm::any_of(prefixes, isNamePrefixed)) + f.addFnAttr(llvm::Attribute::NoInline); + } + }); + m.def( "assemble_amdgcn", [](const std::string &assembly, const std::string &arch, From 16ce143b54eacf465c5a90a6aabdc9c3a723cb99 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 22 Nov 2024 11:16:23 -0800 Subject: [PATCH 12/12] [AMD] Re-enable overflow test in test_reduce_layouts (#5233) https://github.com/triton-lang/triton/pull/5153 fixed the issue; but we missed enabling one of the disabled case. --- python/test/unit/language/test_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index e62373d6fb..b2e0fa59a8 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -2666,8 +2666,6 @@ def test_reduce_layouts(M, N, src_layout, axis, epilogue_kind, dtype_str, add_ov pytest.skip("Skipping because tensor shape is smaller than M(f)maLayout instr_shape") if is_hip() and isinstance(src_layout, MfmaLayout) and ((M, N) == (128, 128)): pytest.skip("Skipping test because it runs out of shared memory") - if add_overflow_check is True and is_hip(): - pytest.skip("overflow check disabled on HIP while fixing issues") if reduce_op == "sum" and dtype_str == "float16" and M * N > 1024: pytest.skip("Skipping sum reduction on float16 due to accuracy issues")