diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index dca8fd9a0bca0..2152de9709dc6 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -599,6 +599,70 @@ described in the ``s2g.tile`` mode intrinsics above.
 For more information, refer PTX ISA
 ``_.
 
+'``llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
+of PTX instructions. These instructions initiate an asynchronous prefetch
+of tensor data from global memory to the L2 cache. In tile mode, the
+multi-dimensional layout of the source tensor is preserved at the destination.
+The dimension of the tensor data ranges from 1d to 5d with the coordinates
+specified by the ``i32 %d0 ... i32 %d4`` arguments.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer PTX ISA
+``_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.async.bulk.prefetch.tensor.[3-5]d.L2.global*`` set
+of PTX instructions. These instructions initiate an asynchronous prefetch
+of tensor data from global memory to the L2 cache. In im2col mode, some
+dimensions of the source tensor are unrolled into a single dimensional
+column at the destination. In this mode, the tensor has to be at least
+three-dimensional. Along with the tensor coordinates, im2col offsets are
+also specified (denoted by ``i16 %im2col0 ... i16 %im2col2``). The number
+of im2col offsets is two less than the number of dimensions of the tensor
+operation. The last argument to these intrinsics is a boolean flag, with
+the same functionality as described in the ``tile`` mode intrinsics above.
+
+For more information, refer PTX ISA
+``_.
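+
+As an illustration, calls to these intrinsics may look like the following
+sketch (the SSA value names are placeholders, not part of the intrinsic
+definitions):
+
+.. code-block:: llvm
+
+  ; Tile mode: prefetch a 2D tile at coordinates (%x, %y) described by the
+  ; tensor-map %tmap. The flag is 0, so the cache-hint operand is ignored.
+  call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %x, i32 %y, i64 0, i1 0)
+
+  ; Im2col mode: prefetch with one im2col offset and a valid cache hint
+  ; (the flag is 1, so %ch is used).
+  call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %off0, i64 %ch, i1 1)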
+
 Other Intrinsics
 ----------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 049d843015d5a..115fcee0b04f2 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -613,6 +613,28 @@ class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
                 ImmArg<ArgIndex<FlagsStartIdx>>];
 }
 
+class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
+  string Name = "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d";
+
+  bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
+  int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
+  list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
+  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
+  list<LLVMType> ArgsTy = !listconcat(
+                          [llvm_ptr_ty],   // tensormap_ptr
+                          TensorDimsTy,    // actual tensor dims
+                          Im2ColOffsetsTy, // im2col offsets
+                          [llvm_i64_ty,    // cache_hint
+                           llvm_i1_ty]     // Flag for cache_hint
+                          );
+
+  int TempFlagsStartIdx = !add(dim, 2);
+  int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
+  list<IntrinsicProperty> IntrProp = [IntrConvergent,
+      ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
+      ImmArg<ArgIndex<FlagsStartIdx>>];
+}
+
 let TargetPrefix = "nvvm" in {
   def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4902,6 +4924,8 @@ foreach dim = [1, 2, 3, 4, 5] in {
       def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
     foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
       def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
+    foreach prefetch = [CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>] in
+      def prefetch.Name : DefaultAttrsIntrinsic<[], prefetch.ArgsTy, prefetch.IntrProp>;
   }
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 0c472c456bd5d..2e7cf10d48cb6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -4175,6 +4175,10 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
     return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, );                     \
   }()
 
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode)                   \
+  (IsCacheHint ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH     \
+               : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
+
 static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
                                               bool IsCacheHint, bool IsIm2Col) {
   if (IsIm2Col) {
@@ -4242,6 +4246,55 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
   }
 }
 
+static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
+                                                   bool IsIm2Col) {
+  if (IsIm2Col) {
+    switch (Dim) {
+    case 3:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL);
+    case 4:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL);
+    case 5:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL);
+    default:
+      llvm_unreachable("Invalid Dimension in im2col mode for "
+                       "GetCpAsyncBulkTensorPrefetchOpcode.");
+    }
+  } else {
+    switch (Dim) {
+    case 1:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE);
+    case 2:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE);
+    case 3:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE);
+    case 4:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE);
+    case 5:
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE);
+    default:
+      llvm_unreachable("Invalid Dimension in tile mode for "
+                       "GetCpAsyncBulkTensorPrefetchOpcode.");
+    }
+  }
+}
+
+static size_t GetDimsFromIntrinsic(unsigned IID) {
+  switch (IID) {
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
+    return 3;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
+    return 4;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
+    return 5;
+  default:
+    llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
+  }
+}
+
 void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
                                                          bool IsIm2Col) {
   // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
@@ -4250,21 +4303,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
   //        multicast_flag, cache_hint_flag}
   // NumOperands = {Chain, IID} + {Actual intrinsic args}
   //             = {2} + {7 + dims + im2col_offsets}
-  auto getDimsFromIntrinsic = [](unsigned IID) {
-    switch (IID) {
-    case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
-      return 3;
-    case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
-      return 4;
-    case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
-      return 5;
-    default:
-      llvm_unreachable(
-          "Invalid im2col intrinsic in SelectCpAsyncBulkTensorG2SCommon.");
-    }
-  };
   size_t NumOps = N->getNumOperands();
-  size_t NumDims = IsIm2Col ? getDimsFromIntrinsic(N->getConstantOperandVal(1))
+  size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
                             : (NumOps - 9);
   // Offsets is always 'NumDims - 2' and only for im2col mode
   size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
@@ -4316,6 +4356,30 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
+                                                              bool IsIm2Col) {
+  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+  // {src, dims{d0...dN}, im2col_offsets{dims-2}
+  //  cache_hint, cache_hint_flag}
+  // NumOperands = {Chain, IID} + {Actual intrinsic args}
+  //             = {2} + {3 + dims + im2col_offsets}
+  size_t NumOps = N->getNumOperands();
+  size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
+                            : (NumOps - 5);
+  // Offsets is always 'NumDims - 2' and only for im2col mode
+  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+  size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
+
+  SDLoc DL(N);
+  SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
+  Ops.push_back(N->getOperand(0)); // Chain operand
+
+  unsigned Opcode =
+      GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
+  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
 bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   unsigned IID = N->getConstantOperandVal(1);
   switch (IID) {
@@ -4345,5 +4409,17 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
     SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
     return true;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
+    SelectCpAsyncBulkTensorPrefetchCommon(N);
+    return true;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
+    SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
+    return true;
   }
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 6aa4e9f615a48..d6c80a31b7463 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -94,6 +94,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   void SelectI128toV2I64(SDNode *N);
   void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
+  void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
   inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
     return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 536be22510703..5878940812f62 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -605,6 +605,52 @@ foreach dim = [1, 2, 3, 4, 5] in {
   }
 }
 
+// TMA Prefetch from Global memory to L2 cache
+class PREFETCH_STRINGS<int dim, string mode, bit ch> {
+  string prefix = "cp.async.bulk.prefetch.tensor";
+  string dir = "L2.global";
+  string inst_name = prefix
+                     # "." # dim # "d"
+                     # "." # dir
+                     # "." # mode
+                     # !if(ch, ".L2::cache_hint", "");
+  string intr_name = "CP_ASYNC_BULK_TENSOR_PREFETCH_"
+                     # dim # "D"
+                     # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
+}
+
+multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
+  defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
+  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+  defvar asm_str_default = " [$tmap, {{" # dims_str # "}}]";
+
+  defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
+  defvar im2col_dag = !if(!eq(mode, "im2col"),
+    !dag(ins, !listsplat(Int16Regs, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
+    (ins));
+  defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
+  defvar im2col_asm_str = ", {{" # im2col_str # "}}";
+
+  defvar asm_str = !if(!eq(mode, "im2col"),
+                       !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
+
+  def "": NVPTXInst<(outs),
+          !con((ins Int64Regs:$tmap), dims_dag, im2col_dag),
+          !strconcat(PREFETCH_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
+          Requires<[hasPTX<80>, hasSM<90>]>;
+  def _CH: NVPTXInst<(outs),
+           !con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
+           !strconcat(PREFETCH_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
+           Requires<[hasPTX<80>, hasSM<90>]>;
+}
+
+foreach dim = [1, 2, 3, 4, 5] in {
+  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+    defm PREFETCH_STRINGS<dim, mode, 0>.intr_name :
+      CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>;
+  }
+}
+
 //-----------------------------------
 // MBarrier Functions
 //-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
new file mode 100644
index 0000000000000..cb3b0c03f75d0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tm, i32 %d0, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag);
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 %f1);
+
+; CHECK-LABEL: cp_async_bulk_tensor_prefetch_tile_1d
+define void @cp_async_bulk_tensor_prefetch_tile_1d(ptr %tmap, i32 %d0, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_tile_1d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile.L2::cache_hint [%rd1, {%r1}], %rd2;
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_prefetch_tile_2d
+define void @cp_async_bulk_tensor_prefetch_tile_2d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_tile_2d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2}], %rd2;
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_prefetch_3d
+define void @cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_3d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b16 %rs<2>;
+; CHECK-PTX-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3];
+; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3}], %rd2;
+; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col [%rd1, {%r1, %r2, %r3}], {%rs1};
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1}, %rd2;
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_prefetch_4d
+define void @cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_4d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3];
+; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4];
+; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], %rd2;
+; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6];
+; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2};
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 1)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_prefetch_5d
+define void @cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_5d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
+; CHECK-PTX-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_5d_param_3];
+; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4];
+; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5];
+; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], %rd2;
+; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7];
+; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8];
+; CHECK-PTX-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9];
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3};
+; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 1)
+  ret void
+}