diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index d28eb6860c33a..2dc8f9ff6a57f 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -971,6 +971,10 @@ Syntax:
 
   declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
   declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
   declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
   declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
 
@@ -983,7 +987,10 @@ The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic
 correspond to the '``prefetch.*``;' and '``prefetchu.*``' family of PTX instructions.
 The '``prefetch.*``' instructions bring the cache line containing the specified address
 in global or local memory address space into the
-specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line
+specified cache level (L1 or L2). If the '``.tensormap``' qualifier is specified, then the
+prefetch instruction brings the cache line containing the specified address in the
+'``.const``' or '``.param``' memory state space for subsequent use by the
+'``cp.async.bulk.tensor``' instruction. The '``prefetchu.*``' instruction brings the cache line
 containing the specified generic address into the specified uniform cache level. If no address
 space is specified, it is assumed to be generic address. The intrinsic uses and eviction priority
 which can be accessed by the '``.level::eviction_priority``' modifier.
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 967d1663f237b..1bcc442a3f77f 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -137,6 +137,7 @@
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;  // (shared)ptr
+def llvm_constant_ptr_ty: LLVMQualPointerType<4>;  // (const)ptr
 def llvm_local_ptr_ty   : LLVMQualPointerType<5>;  // (local)ptr
 def llvm_tmem_ptr_ty    : LLVMQualPointerType<6>;  // (tensor memory)ptr
 def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
@@ -2212,15 +2213,17 @@ def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d
 // Intrinsics for Prefetch and Prefetchu
 let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in {
   foreach level = ["L1", "L2"] in {
-    def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>;
-    def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>;
-    def int_nvvm_prefetch_local_ # level : Intrinsic<[], [llvm_local_ptr_ty]>;
+    def int_nvvm_prefetch_ # level : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
+    def int_nvvm_prefetch_global_ # level : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_local_ # level : DefaultAttrsIntrinsic<[], [llvm_local_ptr_ty]>;
   }
 
+  def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>;
+
   foreach eviction_priority = ["evict_normal", "evict_last"] in
-    def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
 
-  def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>;
+  def int_nvvm_prefetchu_L1 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
 }
 
 // applypriority
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 15f45a1f35e2f..d599aeaa69204 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3908,6 +3908,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
     return true;
   }
 
+  case Intrinsic::nvvm_prefetch_tensormap: {
+    auto &DL = I.getDataLayout();
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = getPointerTy(DL);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.flags =
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
+    Info.align.reset();
+    return true;
+  }
+
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_p: {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d33719236b172..d4a0ca794cd88 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -39,6 +39,12 @@ def AS_match {
   code global = [{
     return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
   }];
+  code const = [{
+    return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST);
+  }];
+  code param = [{
+    return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM);
+  }];
 }
 
@@ -950,33 +956,47 @@ foreach dim = 3...5 in {
 defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
                                                                [hasTMACTAGroupSupport]>;
 
-//Prefetch and Prefetchu
-
-let Predicates = [hasPTX<80>, hasSM<90>] in {
-  class PREFETCH_INTRS<string InstName> :
-        BasicNVPTXInst<(outs), (ins ADDR:$addr),
-                  InstName,
-                  [(!cast<Intrinsic>(!strconcat("int_nvvm_",
-                    !subst(".", "_", InstName))) addr:$addr)]>;
+//Prefetchu and Prefetch
 
-  def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">;
-  def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">;
-  def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">;
-  def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">;
-  def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
-  def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
+defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr);
 
-  def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
-                                                       "prefetch.global.L2::evict_normal",
-                                                       [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>;
+multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code pred> {
+  def !tolower(suffix) : PatFrag<(ops node:$addr), frag_pat, pred>;
+}
 
-  def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr),
-                                                     "prefetch.global.L2::evict_last",
-                                                     [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>;
 
-  def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
+multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> {
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+                          "prefetch" # addrspace_name # ".tensormap",
+                          [(pattern_frag addr:$addr)]>,
+           Requires<[hasPTX<80>, hasSM<90>]>;
 }
+defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>;
+defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>;
+defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>;
+
+class PREFETCH_INTRS<string InstName, Intrinsic Intr> :
+  BasicNVPTXInst<(outs), (ins ADDR:$addr),
+                 InstName,
+                 [(Intr addr:$addr)]>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
+
+def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>;
+def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>;
+def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>;
+def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>;
+def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>;
+def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>;
+def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>;
+def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal",
+                                                     int_nvvm_prefetch_global_L2_evict_normal>;
+def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last",
+                                                   int_nvvm_prefetch_global_L2_evict_last>;
+
 //Applypriority intrinsics
 class APPLYPRIORITY_L2_INTRS<string InstName> :
       BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size),
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 3ae2d9d5181a3..f4f89613b358d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
   case Intrinsic::nvvm_isspacep_global:
   case Intrinsic::nvvm_isspacep_local:
   case Intrinsic::nvvm_isspacep_shared:
-  case Intrinsic::nvvm_isspacep_shared_cluster: {
+  case Intrinsic::nvvm_isspacep_shared_cluster:
+  case Intrinsic::nvvm_prefetch_tensormap: {
     OpIndexes.push_back(0);
     return true;
   }
@@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
       return ConstantInt::get(II->getType(), *R);
     return nullptr;
   }
+  case Intrinsic::nvvm_prefetch_tensormap: {
+    IRBuilder<> Builder(II);
+    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
+                                        NewV);
+  }
   }
   return nullptr;
 }
diff --git a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll
new file mode 100644
index 0000000000000..822a75eab24c2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify %}
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+@constant_tensormap = addrspace(4) global [64 x i8] zeroinitializer, align 64
+
+; Inference from const address space
+define void @test_infer_const_from_cast() {
+; INFER-LABEL: @test_infer_const_from_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; BOTH: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_const_from_cast(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+  %casted = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %casted)
+  ret void
+}
+
+; Cast from Const space to Generic
+define void @test_const_to_generic_cast(ptr addrspace(4) %const_ptr) {
+; INFER-LABEL: @test_const_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+; PTX-LABEL: .visible .func test_const_to_generic_cast(
+; PTX: prefetch.const.tensormap [%rd{{[0-9]+}}];
+entry:
+  %cast = addrspacecast ptr addrspace(4) %const_ptr to ptr
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+  ret void
+}
+
+; No inference possible
+define void @test_no_inference_possible(ptr %generic_ptr) {
+; INFER-LABEL: @test_no_inference_possible
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+; PTX-LABEL: .visible .func test_no_inference_possible(
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+  ret void
+}
+
+; Cast from Parameter space to Generic
+define void @test_param_to_generic_cast(ptr addrspace(101) %param_ptr) {
+; INFER-LABEL: @test_param_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+; PTX-LABEL: .visible .func test_param_to_generic_cast(
+; PTX: prefetch.param.tensormap [%rd{{[0-9]+}}];
+entry:
+  %cast = addrspacecast ptr addrspace(101) %param_ptr to ptr
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+  ret void
+}
+
+; Multiple casts in sequence
+define void @test_infer_through_multiple_casts() {
+; INFER-LABEL: @test_infer_through_multiple_casts
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_through_multiple_casts(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+  %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+  %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
+  %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3)
+  ret void
+}
+
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101))
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll
index a64e4fe7a508e..862e26d704679 100644
--- a/llvm/test/CodeGen/NVPTX/prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/prefetch.ll
@@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr)
 declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
 declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
 
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
 declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
 declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
 
@@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) {
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
   ret void
+}
+
+define void @prefetch_tensormap(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetch_tensormap(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_tensormap_param_0];
+; CHECK-PTX64-NEXT:    prefetch.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+  ret void
+}
+
+define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) {
+; CHECK-PTX64-LABEL: prefetch_const_tensormap(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_const_tensormap_param_0];
+; CHECK-PTX64-NEXT:    prefetch.const.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+  ret void
+}
+
+define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
+; CHECK-PTX64-LABEL: prefetch_param_tensormap(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_param_tensormap_param_0];
+; CHECK-PTX64-NEXT:    prefetch.param.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+  ret void
 }
\ No newline at end of file
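
Usage note (editorial, not part of the patch): the sketch below shows how a kernel might warm the cache for a tensormap descriptor held in the '.const' state space before a later TMA copy. The global @grid_tensormap, the kernel name, and the descriptor size are hypothetical; only the intrinsic declaration comes from the patch, and the code assumes sm_90 with PTX 8.0 as the instruction patterns above require.

; Minimal usage sketch, assuming a 128-byte tensormap descriptor placed in
; the constant address space (addrspace 4) by the host.
target triple = "nvptx64-nvidia-cuda"

@grid_tensormap = addrspace(4) global [128 x i8] zeroinitializer, align 64

declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))

define ptx_kernel void @warm_tensormap_cache() {
entry:
  ; Bring the cache line holding the descriptor in ahead of the TMA transfer.
  call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @grid_tensormap)
  ; ... llvm.nvvm.cp.async.bulk.tensor.* intrinsics would consume the
  ; descriptor here ...
  ret void
}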