diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index dec6ad4e54115..56b11e6f81417 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -581,6 +581,46 @@ prefetched in terms of bytes and it must be a multiple of 16. For more information, refer PTX ISA ``_. +'``llvm.nvvm.prefetch.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr) + declare void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr) + + declare void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr) + declare void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr) + declare void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr) + declare void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr) + + declare void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr) + declare void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr) + + declare void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr) + +Overview: +""""""""" + +The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic +correspond to the '``prefetch.*``;' and '``prefetchu.*``' family of PTX instructions. +The '``prefetch.*``' instructions bring the cache line containing the +specified address in global or local memory address space into the +specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line +containing the specified generic address into the specified uniform cache level. +If no address space is specified, it is assumed to be generic address. The intrinsic +uses and eviction priority which can be accessed by the '``.level::eviction_priority``' modifier. + +* A prefetch to a shared memory location performs no operation. +* A prefetch into the uniform cache requires a generic address, + and no operation occurs if the address maps to a const, local, or shared memory location. + +For more information, refer to the PTX ISA +``_. + '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index abbe25bf0040a..45a3c5e540dd5 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -48,6 +48,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr +def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr // @@ -4999,6 +5000,26 @@ foreach dim = [1, 2, 3, 4, 5] in { } } +// Intrinsics for Prefetch and Prefetchu +foreach level = ["L1", "L2"] in { + foreach addr = ["global", "local", ""] in { + foreach evict = !if(!eq(addr, "global"), ["evictlast", "evictnormal"], ["evictnormal"]) in { + defvar suffix = "" # !if(!eq(addr, ""), "", addr # "_") # level # "_" # evict; + def int_nvvm_prefetch_ # suffix : Intrinsic<[], + !cond( + !eq(addr, "global") : [llvm_global_ptr_ty], + !eq(addr, "local") : [llvm_local_ptr_ty], + !eq(addr, "") : [llvm_ptr_ty]), + [IntrArgMemOnly, ReadOnly>, + NoCapture>]>; + } + } +} + +def int_nvvm_prefetchu_L1_evictnormal : Intrinsic<[], [llvm_ptr_ty], + [IntrArgMemOnly, ReadOnly>, NoCapture>]>; + + // Intrinsics for Bulk Copy using TMA (non-tensor) // From Global to Shared Cluster def int_nvvm_cp_async_bulk_global_to_shared_cluster diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index a0d00e4aac560..bd378d990f719 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -740,6 +740,33 @@ foreach dim = [1, 2, 3, 4, 5] in { } } +//Prefetch and Prefetchu +class Join lst> { + string ret = !foldl("", lst, a, b, !if(!eq(a, ""), b, !if(!eq(b,""), a, !strconcat(a, sep, b)))); +} + +class PREFETCH_INTRS : + NVPTXInst<(outs), (ins Int64Regs:$addr), + InstName # " [$addr];", + [(!cast(!strconcat("int_nvvm_", + !subst(".", "_", InstName))) i64:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + + +// Only global supports evictlast and evictnormal. +// Other variants (local and default) only support evictnormal +foreach level = ["L1", "L2"] in { + foreach addr = ["global", "local", ""] in { + foreach evict = !if(!eq(addr, "global"), ["evictlast", "evictnormal"], ["evictnormal"]) in { + defvar suffix = Join<"_", [addr, level, evict]>.ret; + defvar inst_name = "prefetch." # !subst("_", ".", suffix); + def PREFETCH_# suffix : PREFETCH_INTRS; + } + } +} + +def PREFETCHU_L1_EVICTNORMAL : PREFETCH_INTRS<"prefetchu.L1.evictnormal">; + //----------------------------------- // MBarrier Functions //----------------------------------- diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll new file mode 100644 index 0000000000000..cf47000ffd9aa --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/prefetch.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr) +declare void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr) + +declare void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr) +declare void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr) +declare void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr) +declare void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr) + +declare void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr) +declare void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr) + +declare void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr) + +define void @prefetch_local(ptr addrspace(5) %local_ptr) { +; CHECK-PTX64-LABEL: prefetch_local( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_local_param_0]; +; CHECK-PTX64-NEXT: prefetch.local.L1.evictnormal [%rd1]; +; CHECK-PTX64-NEXT: prefetch.local.L2.evictnormal [%rd1]; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr) + tail call void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr) + ret void +} + +define void @prefetch_global(ptr addrspace(1) %global_ptr) { +; CHECK-PTX64-LABEL: prefetch_global( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_global_param_0]; +; CHECK-PTX64-NEXT: prefetch.global.L1.evictnormal [%rd1]; +; CHECK-PTX64-NEXT: prefetch.global.L2.evictnormal [%rd1]; +; CHECK-PTX64-NEXT: prefetch.global.L1.evictlast [%rd1]; +; CHECK-PTX64-NEXT: prefetch.global.L2.evictlast [%rd1]; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr) + tail call void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr) + tail call void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr) + tail call void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr) + ret void +} + + +define void @prefetch_(ptr %ptr) { +; CHECK-PTX64-LABEL: prefetch_( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch__param_0]; +; CHECK-PTX64-NEXT: prefetch.L1.evictnormal [%rd1]; +; CHECK-PTX64-NEXT: prefetch.L2.evictnormal [%rd1]; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr) + tail call void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr) + ret void +} + +define void @prefetchu_l1(ptr %ptr) { +; CHECK-PTX64-LABEL: prefetchu_l1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetchu_l1_param_0]; +; CHECK-PTX64-NEXT: prefetchu.L1.evictnormal [%rd1]; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr) + ret void +} \ No newline at end of file