diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 25a230f65fd3d..a5a78a2882eec 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -553,6 +553,34 @@ it must be a multiple of 16.
 For more information, refer PTX ISA
 ``_.
 
+'``llvm.nvvm.cp.async.bulk.prefetch.L2``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.prefetch.L2``' intrinsic
+corresponds to the ``cp.async.bulk.prefetch.L2.*`` family
+of PTX instructions. These instructions initiate an asynchronous
+prefetch of bulk data from global memory to the L2 cache.
+The 32-bit operand ``%size`` specifies the amount of memory to be
+prefetched in bytes, and it must be a multiple of 16.
+
+* The last argument to this intrinsic is a boolean flag indicating
+  support for cache_hint. The flag argument must be a compile-time
+  constant. When set, it indicates a valid cache_hint (``i64 %ch``)
+  and generates the ``.L2::cache_hint`` variant of the PTX
+  instruction, as shown in the example below.
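+
+For example, the following sketch (the operand values are
+illustrative) issues one prefetch with a cache hint and one without:
+
+.. code-block:: llvm
+
+  ; Prefetch %size bytes (a multiple of 16) with a cache hint.
+  call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
+  ; No cache hint: the flag is 0 and the i64 operand is ignored.
+  call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)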
+
+For more information, refer to the PTX ISA
+``_.
+
 '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 00a76018d8415..00c441920bfa1 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5033,4 +5033,15 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
        NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
        ImmArg<ArgIndex<4>>]>;
 
+// Intrinsics for Bulk Copy Prefetch L2
+def int_nvvm_cp_async_bulk_prefetch_L2
+  : DefaultAttrsIntrinsic<[],
+      [llvm_global_ptr_ty, // src_gmem_ptr
+       llvm_i32_ty,        // copy_size
+       llvm_i64_ty,        // cache_hint
+       llvm_i1_ty],        // Flag for cache_hint
+      [IntrConvergent, IntrArgMemOnly,
+       NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<3>>]>;
+
 } // let TargetPrefix = "nvvm"
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2e66b67dfdcc7..cbceed1df68d6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3105,6 +3105,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
+  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+  // src, size, cache_hint, cache_hint_flag
+  // NumOperands = {Chain, IID} + {Actual intrinsic args}
+  //             = {2} + {4}
+  size_t NumOps = N->getNumOperands();
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+  size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint
+
+  SDLoc DL(N);
+  SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
+  Ops.push_back(N->getOperand(0)); // Chain operand
+
+  unsigned Opcode = IsCacheHint
+                        ? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
+                        : NVPTX::CP_ASYNC_BULK_PREFETCH;
+  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
 bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   unsigned IID = N->getConstantOperandVal(1);
   using TMARedTy = llvm::nvvm::TMAReductionOp;
@@ -3118,6 +3137,9 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
     SelectCpAsyncBulkS2G(N);
     return true;
+  case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
+    SelectCpAsyncBulkPrefetchL2(N);
+    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 8cadde8a82264..c673c83beba0f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -92,6 +92,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   void SelectI128toV2I64(SDNode *N);
   void SelectCpAsyncBulkG2S(SDNode *N);
   void SelectCpAsyncBulkS2G(SDNode *N);
+  void SelectCpAsyncBulkPrefetchL2(SDNode *N);
   void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 2d6ee2e28b4df..4b0858eab9490 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -547,6 +547,18 @@ multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
 defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
 defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;
 
+//------------------------------
+// Bulk Copy Prefetch Functions
+//------------------------------
+def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
+  (ins Int64Regs:$src, Int32Regs:$size),
+  "cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
+
+def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
+  (ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
+  "cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
 //-------------------------------------
 // TMA Async Bulk Tensor Copy Functions
 //-------------------------------------
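Note: taken together, the two instruction definitions above print PTX of the
following shape (a sketch mirroring the CHECK lines in the test below; the
%rd/%r virtual register numbers are whatever the register allocator assigns):

    cp.async.bulk.prefetch.L2.global [%rd1], %r1;
    cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;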
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
index aefd18a0632a0..cbb53df4a49b0 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
@@ -9,6 +9,7 @@ target triple = "nvptx64-nvidia-cuda"
 declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
 declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1)
 declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
+declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1)
 
 define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i16 %mc, i64 %ch) {
 ; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
@@ -116,3 +117,21 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
   tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
   ret void
 }
+
+define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_prefetch(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2];
+; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
+; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1;
+; CHECK-PTX64-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
+  ret void
+}
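To try the new lowering end to end, a minimal standalone module along the
lines of the sketch below can be fed to llc with -mcpu=sm_90 -mattr=+ptx80,
matching the Requires<[hasPTX<80>, hasSM<90>]> predicates above. The function
name @prefetch_example and the 256-byte size are illustrative, not part of
the patch:

    target triple = "nvptx64-nvidia-cuda"

    declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1)

    ; Prefetch 256 bytes (a multiple of 16) from global memory into L2,
    ; without a cache hint.
    define void @prefetch_example(ptr addrspace(1) %p) {
      tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %p, i32 256, i64 0, i1 0)
      ret void
    }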