@@ -131,6 +131,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
131131def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr, address space 3
132132def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr, address space 5
133133def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr, address space 6
134+ def llvm_dshared_ptr_ty : LLVMQualPointerType<7>; // (dshared)ptr, address space 7; presumably distributed (cluster-wide) shared memory -- TODO confirm
134135
135136//
136137// MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
691692 list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
692693 list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
693694 list<LLVMType> ArgsTy = !listconcat(
694- [llvm_shared_ptr_ty , // dst_smem_ptr
695- llvm_shared_ptr_ty, // mbarrier_smem_ptr
696- llvm_ptr_ty], // tensormap_ptr
697- TensorDimsTy, // actual tensor dims
698- Im2ColOffsetsTy, // im2col offsets
699- [llvm_i16_ty, // cta_mask
700- llvm_i64_ty, // cache_hint
701- llvm_i1_ty, // Flag for cta_mask
702- llvm_i1_ty] // Flag for cache_hint
695+ [llvm_dshared_ptr_ty , // dst_dsmem_ptr
696+ llvm_shared_ptr_ty, // mbarrier_smem_ptr
697+ llvm_ptr_ty], // tensormap_ptr
698+ TensorDimsTy, // actual tensor dims
699+ Im2ColOffsetsTy, // im2col offsets
700+ [llvm_i16_ty, // cta_mask
701+ llvm_i64_ty, // cache_hint
702+ llvm_i1_ty, // Flag for cta_mask
703+ llvm_i1_ty] // Flag for cache_hint
703704 );
704705
705706 int TempFlagsStartIdx = !add(dim, 5);
@@ -5118,7 +5119,7 @@ def int_nvvm_mapa
51185119 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
51195120 "llvm.nvvm.mapa">;
// llvm.nvvm.mapa.shared.cluster: takes a (shared)ptr and an i32; after this change
// the result is a (dshared)ptr instead of a (shared)ptr.
// NOTE(review): the i32 is presumably the target CTA rank -- confirm against the PTX `mapa` docs.
51205121def int_nvvm_mapa_shared_cluster
5121- : DefaultAttrsIntrinsic<[llvm_shared_ptr_ty ], [llvm_shared_ptr_ty, llvm_i32_ty],
5122+ : DefaultAttrsIntrinsic<[llvm_dshared_ptr_ty ], [llvm_shared_ptr_ty, llvm_i32_ty],
51225123 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
51235124 "llvm.nvvm.mapa.shared.cluster">;
51245125def int_nvvm_getctarank
@@ -5218,14 +5219,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
52185219// From Global to Shared Cluster
52195220def int_nvvm_cp_async_bulk_global_to_shared_cluster
52205221 : DefaultAttrsIntrinsic<[],
5221- [llvm_shared_ptr_ty , // dst_smem_ptr
5222- llvm_shared_ptr_ty, // mbarrier_ptr
5223- llvm_global_ptr_ty, // src_gmem_ptr
5224- llvm_i32_ty, // copy_size
5225- llvm_i16_ty, // cta_mask
5226- llvm_i64_ty, // cache_hint
5227- llvm_i1_ty, // Flag for cta_mask
5228- llvm_i1_ty], // Flag for cache_hint
5222+ [llvm_dshared_ptr_ty , // dst_dsmem_ptr
5223+ llvm_shared_ptr_ty, // mbarrier_ptr
5224+ llvm_global_ptr_ty, // src_gmem_ptr
5225+ llvm_i32_ty, // copy_size
5226+ llvm_i16_ty, // cta_mask
5227+ llvm_i64_ty, // cache_hint
5228+ llvm_i1_ty, // Flag for cta_mask
5229+ llvm_i1_ty], // Flag for cache_hint
52295230 [IntrConvergent, IntrArgMemOnly,
52305231 WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52315232 NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5235,10 +5236,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
52355236// From Shared CTA to Shared Cluster
52365237def int_nvvm_cp_async_bulk_shared_cta_to_cluster
52375238 : DefaultAttrsIntrinsic<[],
5238- [llvm_shared_ptr_ty , // dst_smem_ptr
5239- llvm_shared_ptr_ty, // mbarrier_ptr
5240- llvm_shared_ptr_ty, // src_smem_ptr
5241- llvm_i32_ty], // copy_size
5239+ [llvm_dshared_ptr_ty , // dst_dsmem_ptr
5240+ llvm_shared_ptr_ty, // mbarrier_ptr
5241+ llvm_shared_ptr_ty, // src_smem_ptr
5242+ llvm_i32_ty], // copy_size
52425243 [IntrConvergent, IntrArgMemOnly,
52435244 WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52445245 NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
0 commit comments