diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 675b458c41e7b..61ae07816bdfd 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -630,6 +630,30 @@ uses and eviction priority which can be accessed by the '``.level::eviction_prio For more information, refer to the PTX ISA ``_. +'``llvm.nvvm.applypriority.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 %size) + declare void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 %size) + +Overview: +""""""""" + +The '``@llvm.nvvm.applypriority.*``' applies the cache eviction priority specified by the +.level::eviction_priority qualifier to the address range [a..a+size) in the specified cache +level. If no state space is specified then Generic Addressing is used. If the specified address +does not fall within the address window of .global state space then the behavior is undefined. +The operand size is an integer constant that specifies the amount of data, in bytes, in the specified cache +level on which the priority is to be applied. The only supported value for the size operand is 128. + +For more information, refer to the PTX ISA +``_. + '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index c32bf0318b5d6..12d43594a3df2 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -5043,6 +5043,16 @@ def int_nvvm_prefetch_global_L2_evict_last: Intrinsic<[], [llvm_global_ptr_ty], def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty], [IntrArgMemOnly, ReadOnly>, NoCapture>]>; +def int_nvvm_applypriority_global_L2_evict_normal + : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly, ReadOnly>, NoCapture>, + ImmArg>]>; + +def int_nvvm_applypriority_L2_evict_normal + : DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly, ReadOnly>, NoCapture>, + ImmArg>]>; + // Intrinsics for Bulk Copy using TMA (non-tensor) // From Global to Shared Cluster diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index ed7963f35a7c7..1fd95abfab217 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -789,6 +789,17 @@ def PREFETCH_GLOBAL_L2_EVICT_LAST : NVPTXInst<(outs), (ins Int64Regs:$addr), def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; +//Applypriority intrinsics +class APPLYPRIORITY_L2_INTRS : + NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size), + StrJoin<".", ["applypriority", addr , "L2::evict_normal"]>.ret # " [$addr], $size;", + [(!cast(StrJoin<"_", ["int_nvvm_applypriority", addr , "L2_evict_normal"]>.ret) + i64:$addr, i64:$size)]>, + Requires<[hasPTX<74>, hasSM<80>]>; + +def APPLYPRIORITY_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"">; +def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">; + //----------------------------------- // MBarrier Functions //----------------------------------- diff --git a/llvm/test/CodeGen/NVPTX/applypriority.ll b/llvm/test/CodeGen/NVPTX/applypriority.ll new file mode 100644 index 0000000000000..af161d82a25ea --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/applypriority.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 %size) +declare void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 %size) + +define void @applypriority_global_L2(ptr addrspace(1) %global_ptr, i64 %size) { +; CHECK-PTX64-LABEL: applypriority_global_L2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [applypriority_global_L2_param_0]; +; CHECK-PTX64-NEXT: applypriority.global.L2::evict_normal [%rd1], 128; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 128) + ret void +} + +define void @applypriority_L2(ptr %ptr, i64 %size) { +; CHECK-PTX64-LABEL: applypriority_L2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [applypriority_L2_param_0]; +; CHECK-PTX64-NEXT: applypriority.L2::evict_normal [%rd1], 128; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 128) + ret void +}