diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index eed68155c7319..dca8fd9a0bca0 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -462,6 +462,143 @@ to left-shift the found bit into the most-significant bit position, otherwise
 the result is the shift amount needed to right-shift the found bit into the
 least-significant bit position. 0xffffffff is returned if no 1 bit is found.
 
+TMA family of Intrinsics
+------------------------
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
+in ``tile`` mode. In tile mode, the multi-dimensional layout of the
+source tensor is preserved at the destination. The dimension of the
+tensor data ranges from 1d to 5d, with the coordinates specified
+by the ``i32 %d0 ... i32 %d4`` arguments.
+
+* The last two arguments to these intrinsics are boolean flags
+  indicating support for the cache_hint and/or multicast modifiers.
+  These flag arguments must be compile-time constants. The backend
+  looks through these flags and lowers the intrinsics appropriately.
+
+* The Nth argument (denoted by ``i1 %flag_ch``), when set, indicates
+  a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+* The [N-1]th argument (denoted by ``i1 %flag_mc``), when set, indicates
+  the presence of a multicast mask (``i16 %mc``) and generates the PTX
+  instruction with the ``.multicast::cluster`` modifier.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
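+
+As an illustrative sketch (the pointer and value names here are hypothetical,
+not part of the intrinsic definition), a 2d tile copy with both modifiers
+enabled sets both trailing flags, so the backend emits the
+``.multicast::cluster.L2::cache_hint`` variant of the instruction:
+
+.. code-block:: llvm
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(
+      ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr %tensor_map,
+      i32 %d0, i32 %d1, i16 %mc_mask, i64 %cache_hint, i1 1, i1 1)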
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
+in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
+are unrolled into a single dimensional column at the destination. In this
+mode, the tensor has to be at least three-dimensional. Along with the tensor
+coordinates, im2col offsets are also specified (denoted by
+``i16 %im2col0 ... i16 %im2col2``). The number of im2col offsets is two less
+than the number of dimensions of the tensor operation. The last two arguments
+to these intrinsics are boolean flags, with the same functionality as described
+in the ``tile`` mode intrinsics above.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+shared::cta to global memory (indicated by the ``s2g`` prefix)
+in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d,
+with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+shared::cta to global memory (indicated by the ``s2g`` prefix)
+in ``im2col`` mode. In this mode, the tensor has to be at least
+three-dimensional. Unlike the ``g2s`` variants, there are no
+im2col offsets for these intrinsics. The last argument to these
+intrinsics is a boolean flag, with the same functionality as
+described in the ``s2g.tile`` mode intrinsics above.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
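+
+As a minimal sketch (argument names are illustrative), a 3d ``s2g`` copy
+in im2col mode without a cache hint passes ``i1 0`` as the last argument,
+so the ``%ch`` operand is ignored and the plain variant is generated:
+
+.. code-block:: llvm
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(
+      ptr addrspace(3) %src, ptr %tensor_map,
+      i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)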
+
 Other Intrinsics
 ----------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index fd0cbed8b2566..049d843015d5a 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -567,6 +567,52 @@ class SHFL_INFO {
   [OpType, llvm_i32_ty, llvm_i32_ty]);
 }
 
+class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
+  string Name = "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim # "d";
+
+  bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
+  int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
+  list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
+  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
+  list<LLVMType> ArgsTy = !listconcat(
+      [llvm_shared_ptr_ty,  // dst_smem_ptr
+       llvm_shared_ptr_ty,  // mbarrier_smem_ptr
+       llvm_ptr_ty],        // tensormap_ptr
+      TensorDimsTy,         // actual tensor dims
+      Im2ColOffsetsTy,      // im2col offsets
+      [llvm_i16_ty,         // cta_mask
+       llvm_i64_ty,         // cache_hint
+       llvm_i1_ty,          // Flag for cta_mask
+       llvm_i1_ty]          // Flag for cache_hint
+      );
+
+  int TempFlagsStartIdx = !add(dim, 5);
+  int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
+  list<IntrinsicProperty> IntrProp = [IntrConvergent,
+      WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
+      NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>,
+      ImmArg<ArgIndex<FlagsStartIdx>>,
+      ImmArg<ArgIndex<!add(FlagsStartIdx, 1)>>];
+}
+
+class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
+  string Name = "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # "d";
+
+  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
+  list<LLVMType> ArgsTy = !listconcat(
+      [llvm_shared_ptr_ty,  // src_smem_ptr
+       llvm_ptr_ty],        // tensormap_ptr
+      TensorDimsTy,         // actual tensor dims
+      [llvm_i64_ty,         // cache_hint
+       llvm_i1_ty]          // Flag for cache_hint
+      );
+  int FlagsStartIdx = !add(dim, 3);
+  list<IntrinsicProperty> IntrProp = [IntrConvergent,
+      ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+      NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+      ImmArg<ArgIndex<FlagsStartIdx>>];
+}
+
 let TargetPrefix = "nvvm" in {
   def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4847,4 +4893,16 @@ def int_nvvm_setmaxnreg_dec_sync_aligned_u32
 def int_nvvm_exit : ClangBuiltin<"__nvvm_exit">,
   Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoReturn]>;
 
+// Intrinsics for Tensor Copy using TMA
+// G2S -> From Global to Shared memory variants
+// S2G -> From Shared to Global memory variants
+foreach dim = [1, 2, 3, 4, 5] in {
+  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+    foreach g2s = [CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, mode>] in
+      def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
+    foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
+      def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
+  }
+}
+
 } // let TargetPrefix = "nvvm"
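To make the TableGen concrete: instantiating
``CP_ASYNC_BULK_TENSOR_G2S_INTR<3, "im2col">`` gives ``NumIm2ColOffsets = 1``
and ``FlagsStartIdx = 9``, so the two trailing ``i1`` arguments become the
``ImmArg`` operands of a signature matching the documentation above (a sketch;
the value names are illustrative):

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(
      ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr %tensor_map,
      i32 %d0, i32 %d1, i32 %d2,  ; three tensor dims
      i16 %im2col0,               ; dim - 2 = 1 im2col offset
      i16 %cta_mask, i64 %cache_hint,
      i1 %flag_mc, i1 %flag_ch)   ; args 9 and 10: compile-time flags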
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 965ed98630a28..0c472c456bd5d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -164,6 +164,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryIntrinsicChain(N))
       return;
     break;
+  case ISD::INTRINSIC_VOID:
+    if (tryIntrinsicVoid(N))
+      return;
+    break;
   case NVPTXISD::Tex1DFloatS32:
   case NVPTXISD::Tex1DFloatFloat:
   case NVPTXISD::Tex1DFloatFloatLevel:
@@ -4150,3 +4154,196 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
 }
 
 bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
+
+#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, suffix)                    \
+  (IsShared32                                                                  \
+       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
+       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode)                         \
+  (IsCacheHint ? (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, _CH))            \
+               : (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, )))
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode)                         \
+  [&]() -> auto {                                                              \
+    if (IsMultiCast && IsCacheHint)                                            \
+      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC_CH);              \
+    if (IsCacheHint)                                                           \
+      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _CH);                 \
+    if (IsMultiCast)                                                           \
+      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC);                 \
+    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, );                      \
+  }()
+
+static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
+                                              bool IsCacheHint, bool IsIm2Col) {
+  if (IsIm2Col) {
+    switch (Dim) {
+    case 3:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL);
+    case 4:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL);
+    case 5:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL);
+    default:
+      llvm_unreachable("Invalid Dimension in im2col mode for "
+                       "GetCpAsyncBulkTensorS2GOpcode.");
+    }
+  } else {
+    switch (Dim) {
+    case 1:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE);
+    case 2:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE);
+    case 3:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE);
+    case 4:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE);
+    case 5:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE);
+    default:
+      llvm_unreachable(
+          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
+    }
+  }
+}
+
+static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
+                                              bool IsMultiCast,
+                                              bool IsCacheHint, bool IsIm2Col) {
+  if (IsIm2Col) {
+    switch (Dim) {
+    case 3:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL);
+    case 4:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL);
+    case 5:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL);
+    default:
+      llvm_unreachable("Invalid Dimension in im2col mode for "
+                       "GetCpAsyncBulkTensorG2SOpcode.");
+    }
+  } else {
+    switch (Dim) {
+    case 1:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE);
+    case 2:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE);
+    case 3:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE);
+    case 4:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE);
+    case 5:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE);
+    default:
+      llvm_unreachable(
+          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
+    }
+  }
+}
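+
+// Worked example (illustrative, following the operand layout documented in
+// the function below): a tile.5d G2S intrinsic node carries
+//   {Chain, IID, dst, mbar, tmap, d0..d4, mc, ch, flag_mc, flag_ch}
+// i.e. NumOps = 2 + (7 + 5) = 14. Hence NumDims = NumOps - 9 = 5, the two
+// immediate flags sit at indices NumOps-2 and NumOps-1, and the multicast
+// operand lives at index 2 + (NumDims + NumOffsets + 3) = 10.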
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
+                                                         bool IsIm2Col) {
+  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+  // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
+  //  multicast, cache_hint,
+  //  multicast_flag, cache_hint_flag}
+  // NumOperands = {Chain, IID} + {Actual intrinsic args}
+  //             = {2} + {7 + dims + im2col_offsets}
+  auto getDimsFromIntrinsic = [](unsigned IID) {
+    switch (IID) {
+    case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
+      return 3;
+    case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
+      return 4;
+    case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
+      return 5;
+    default:
+      llvm_unreachable(
+          "Invalid im2col intrinsic in SelectCpAsyncBulkTensorG2SCommon.");
+    }
+  };
+  size_t NumOps = N->getNumOperands();
+  size_t NumDims = IsIm2Col ? getDimsFromIntrinsic(N->getConstantOperandVal(1))
+                            : (NumOps - 9);
+  // Offsets is always 'NumDims - 2' and only for im2col mode
+  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+  bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
+  size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
+  size_t MultiCastIdx = NumBaseArgs + 2;         // for Chain and IID
+
+  SDLoc DL(N);
+  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
+
+  // Push MultiCast operand, if available
+  if (IsMultiCast)
+    Ops.push_back(N->getOperand(MultiCastIdx));
+
+  // Push CacheHint operand, if available
+  if (IsCacheHint)
+    Ops.push_back(N->getOperand(MultiCastIdx + 1));
+
+  // Finally, the chain operand
+  Ops.push_back(N->getOperand(0));
+
+  bool IsShared32 =
+      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
+  unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
+      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
+  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
+                                                         bool IsIm2Col) {
+  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+  // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
+  // NumOperands = {Chain, IID} + {Actual intrinsic args}
+  //             = {2} + {4 + dims}
+  size_t NumOps = N->getNumOperands();
+  size_t NumDims = NumOps - 6;
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
+
+  SDLoc DL(N);
+  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
+  Ops.push_back(N->getOperand(0)); // Chain operand
+
+  bool IsShared32 =
+      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
+  unsigned Opcode =
+      GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
+  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
+  unsigned IID = N->getConstantOperandVal(1);
+  switch (IID) {
+  default:
+    return false;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
+    SelectCpAsyncBulkTensorS2GCommon(N);
+    return true;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
+    SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
+    return true;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
+    SelectCpAsyncBulkTensorG2SCommon(N);
+    return true;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
+    SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
+    return true;
+  }
+}
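As an end-to-end sketch (mirroring the tests further below), a ``tile.1d``
call with both flags set makes ``IsMultiCast`` and ``IsCacheHint`` true, so
selection picks the ``_MC_CH`` opcode variant and the printed PTX carries
both modifiers:

.. code-block:: llvm

  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(
      ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap,
      i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1)
  ; => cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;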
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c128c082c2983..6aa4e9f615a48 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -71,6 +71,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   void Select(SDNode *N) override;
   bool tryIntrinsicNoChain(SDNode *N);
   bool tryIntrinsicChain(SDNode *N);
+  bool tryIntrinsicVoid(SDNode *N);
   void SelectTexSurfHandle(SDNode *N);
   bool tryLoad(SDNode *N);
   bool tryLoadVector(SDNode *N);
@@ -91,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
   void SelectV2I64toI128(SDNode *N);
   void SelectI128toV2I64(SDNode *N);
+  void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
+  void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
   inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
     return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 1f4938d9fcf5a..536be22510703 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -498,6 +498,113 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
                [(int_nvvm_cp_async_bulk_wait_group_read (i32 timm:$n))]>,
                Requires<[hasPTX<80>, hasSM<90>]>;
 
+//-----------------------------------
+// TMA Async Tensor Copy Functions
+//-----------------------------------
+
+// From Global to Shared memory (G2S)
+class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
+  string prefix = "cp.async.bulk.tensor";
+  string dir = "shared::cluster.global";
+  string completion = "mbarrier::complete_tx::bytes";
+  string inst_name = prefix
+                     # "." # dim # "d"
+                     # "." # dir
+                     # "." # mode
+                     # "." # completion
+                     # !if(mc, ".multicast::cluster", "")
+                     # !if(ch, ".L2::cache_hint", "");
+  string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_"
+                     # dim # "D"
+                     # !if(is_shared32, "_SHARED32", "")
+                     # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
+}
+
+multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
+  defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
+  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+  defvar asm_str_default = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+  defvar rc = !if(is_shared32, Int32Regs, Int64Regs);
+
+  defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
+  defvar im2col_dag = !if(!eq(mode, "im2col"),
+    !dag(ins, !listsplat(Int16Regs, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
+    (ins));
+  defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
+  defvar im2col_asm_str = ", {{" # im2col_str # "}}";
+
+  defvar asm_str = !if(!eq(mode, "im2col"),
+    !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
+
+  def "": NVPTXInst<(outs),
+          !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag),
+          !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>,
+          Requires<[hasPTX<80>, hasSM<90>]>;
+  def _MC: NVPTXInst<(outs),
+           !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)),
+           !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>,
+           Requires<[hasPTX<80>, hasSM<90>]>;
+  def _CH: NVPTXInst<(outs),
+           !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
+           !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>,
+           Requires<[hasPTX<80>, hasSM<90>]>;
+  def _MC_CH: NVPTXInst<(outs),
+              !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)),
+              !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>,
+              Requires<[hasPTX<80>, hasSM<90>]>;
+}
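+
+// As an illustration of the string helpers above:
+// G2S_STRINGS<1, "tile", 1, 1>.inst_name evaluates to
+//   "cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
+// and the records defined here get names like
+// CP_ASYNC_BULK_TENSOR_G2S_1D_TILE{_MC,_CH,_MC_CH} (with a _SHARED32 infix
+// when is_shared32 is set), matching the opcode macros in
+// NVPTXISelDAGToDAG.cpp.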
+
+foreach dim = [1, 2, 3, 4, 5] in {
+  foreach shared32 = [true, false] in {
+    foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+      defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name :
+        CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>;
+    }
+  }
+}
+
+// From Shared to Global memory (S2G)
+class S2G_STRINGS<int dim, string mode, bit ch, bit is_shared32 = 0> {
+  string prefix = "cp.async.bulk.tensor";
+  string dir = "global.shared::cta";
+  string completion = "bulk_group";
+  string inst_name = prefix
+                     # "." # dim # "d"
+                     # "." # dir
+                     # "." # mode
+                     # "." # completion
+                     # !if(ch, ".L2::cache_hint", "");
+  string intr_name = "CP_ASYNC_BULK_TENSOR_S2G_"
+                     # dim # "D"
+                     # !if(is_shared32, "_SHARED32", "")
+                     # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
+}
+
+multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, bit shared32, string mode> {
+  defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
+  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+  defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
+  defvar rc = !if(shared32, Int32Regs, Int64Regs);
+
+  def "": NVPTXInst<(outs),
+          !con((ins rc:$src, Int64Regs:$tmap), dims_dag),
+          !strconcat(S2G_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
+          Requires<[hasPTX<80>, hasSM<90>]>;
+  def _CH: NVPTXInst<(outs),
+           !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch)),
+           !strconcat(S2G_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
+           Requires<[hasPTX<80>, hasSM<90>]>;
+}
+
+foreach dim = [1, 2, 3, 4, 5] in {
+  foreach shared32 = [true, false] in {
+    foreach mode = !if(!ge(dim, 3), ["tile", "im2col_no_offs"], ["tile"]) in {
+      defm S2G_STRINGS<dim, mode, 0, shared32>.intr_name :
+        CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, shared32, mode>;
+    }
+  }
+}
+
 //-----------------------------------
 // MBarrier Functions
 //-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
new file mode 100644
index 0000000000000..fd1a41a0dd1d2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
@@ -0,0 +1,459 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr
addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2); + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_1d +define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 0, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 %ch, i1 0, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 undef, i1 1, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_2d +define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<3>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 0, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 %ch, i1 0, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 undef, i1 1, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_3d +define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, 
[cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 0, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 %ch, i1 0, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_4d +define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; 
CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 undef, i1 0, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 %ch, i1 0, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_5d +define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX64-NEXT: 
ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 
%d2, i32 %d3, i32 %d4, i16 undef, i64 undef, i1 0, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i64 %ch, i1 0, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2col_3d +define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, 
[cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 0, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 %ch, i1 0, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2col_4d +define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; 
CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 0, i1 0)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 %ch, i1 0, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2col_5d
+define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<5>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
+; CHECK-PTX64-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<5>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 0, i1 0)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 %ch, i1 0, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
new file mode 100644
index 0000000000000..50f5a2e82e230
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag);
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_tile_1d
+define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_tile_1d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_tile_1d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_tile_2d
+define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_tile_2d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
+; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_tile_2d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_3d
+define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_3d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_2];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3];
+; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4];
+; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_3d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_4d
+define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_4d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3];
+; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4];
+; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5];
+; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_4d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_5d
+define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_5d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3];
+; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4];
+; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5];
+; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6];
+; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_5d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  ret void
+}