@@ -131,6 +131,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
131131def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
132132def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
133133def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
134+ def llvm_dshared_ptr_ty : LLVMQualPointerType<7>; // (dshared)ptr: address space 7, used for shared-cluster destinations/results
134135
135136//
136137// MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
691692 list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
692693 list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
693694 list<LLVMType> ArgsTy = !listconcat(
694- [llvm_shared_ptr_ty , // dst_smem_ptr
695- llvm_shared_ptr_ty, // mbarrier_smem_ptr
696- llvm_ptr_ty], // tensormap_ptr
697- TensorDimsTy, // actual tensor dims
698- Im2ColOffsetsTy, // im2col offsets
699- [llvm_i16_ty, // cta_mask
700- llvm_i64_ty, // cache_hint
701- llvm_i1_ty, // Flag for cta_mask
702- llvm_i1_ty] // Flag for cache_hint
695+ [llvm_dshared_ptr_ty , // dst_dsmem_ptr
696+ llvm_shared_ptr_ty, // mbarrier_smem_ptr
697+ llvm_ptr_ty], // tensormap_ptr
698+ TensorDimsTy, // actual tensor dims
699+ Im2ColOffsetsTy, // im2col offsets
700+ [llvm_i16_ty, // cta_mask
701+ llvm_i64_ty, // cache_hint
702+ llvm_i1_ty, // Flag for cta_mask
703+ llvm_i1_ty] // Flag for cache_hint
703704 );
704705
705706 int TempFlagsStartIdx = !add(dim, 5);
@@ -5087,7 +5088,7 @@ def int_nvvm_mapa
50875088 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
50885089 "llvm.nvvm.mapa">;
50895090def int_nvvm_mapa_shared_cluster
5090- : DefaultAttrsIntrinsic<[llvm_shared_ptr_ty ], [llvm_shared_ptr_ty, llvm_i32_ty],
5091+ : DefaultAttrsIntrinsic<[llvm_dshared_ptr_ty ], [llvm_shared_ptr_ty, llvm_i32_ty], // result is now a dshared ptr; operands stay (shared ptr, i32)
50915092 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
50925093 "llvm.nvvm.mapa.shared.cluster">;
50935094def int_nvvm_getctarank
@@ -5187,14 +5188,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
51875188// From Global to Shared Cluster
51885189def int_nvvm_cp_async_bulk_global_to_shared_cluster
51895190 : DefaultAttrsIntrinsic<[],
5190- [llvm_shared_ptr_ty , // dst_smem_ptr
5191- llvm_shared_ptr_ty, // mbarrier_ptr
5192- llvm_global_ptr_ty, // src_gmem_ptr
5193- llvm_i32_ty, // copy_size
5194- llvm_i16_ty, // cta_mask
5195- llvm_i64_ty, // cache_hint
5196- llvm_i1_ty, // Flag for cta_mask
5197- llvm_i1_ty], // Flag for cache_hint
5191+ [llvm_dshared_ptr_ty , // dst_dsmem_ptr
5192+ llvm_shared_ptr_ty, // mbarrier_ptr
5193+ llvm_global_ptr_ty, // src_gmem_ptr
5194+ llvm_i32_ty, // copy_size
5195+ llvm_i16_ty, // cta_mask
5196+ llvm_i64_ty, // cache_hint
5197+ llvm_i1_ty, // Flag for cta_mask
5198+ llvm_i1_ty], // Flag for cache_hint
51985199 [IntrConvergent, IntrArgMemOnly,
51995200 WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52005201 NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5204,10 +5205,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
52045205// From Shared CTA to Shared Cluster
52055206def int_nvvm_cp_async_bulk_shared_cta_to_cluster
52065207 : DefaultAttrsIntrinsic<[],
5207- [llvm_shared_ptr_ty , // dst_smem_ptr
5208- llvm_shared_ptr_ty, // mbarrier_ptr
5209- llvm_shared_ptr_ty, // src_smem_ptr
5210- llvm_i32_ty], // copy_size
5208+ [llvm_dshared_ptr_ty , // dst_dsmem_ptr
5209+ llvm_shared_ptr_ty, // mbarrier_ptr
5210+ llvm_shared_ptr_ty, // src_smem_ptr
5211+ llvm_i32_ty], // copy_size
52115212 [IntrConvergent, IntrArgMemOnly,
52125213 WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52135214 NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
0 commit comments