Skip to content

Commit fee6e53

Browse files
authored
[NVPTX] Add prefetch tensormap variant (#146203)
[NVPTX] Add Prefetch tensormap intrinsics This PR adds prefetch intrinsics with the relevant tensormap_space. * Lit tests are added as part of prefetch.ll * The generated PTX is verified with a 12.3 ptxas executable. * Added docs for these intrinsics in NVPTXUsage.rst. For more information, refer to the PTX ISA for the prefetch intrinsic: [Prefetch Tensormap](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu) @durga4github @schwarzschild-radius
1 parent 4784585 commit fee6e53

File tree

7 files changed

+197
-28
lines changed

7 files changed

+197
-28
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,10 @@ Syntax:
971971
declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
972972
declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
973973
974+
declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
975+
declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
976+
declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
977+
974978
declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
975979
declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
976980
@@ -983,7 +987,10 @@ The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic
983987
correspond to the '``prefetch.*``' and '``prefetchu.*``' family of PTX instructions.
984988
The '``prefetch.*``' instructions bring the cache line containing the
985989
specified address in global or local memory address space into the
986-
specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line
990+
specified cache level (L1 or L2). If the '``.tensormap``' qualifier is specified then the
991+
prefetch instruction brings the cache line containing the specified address in the
992+
'``.const``' or '``.param``' memory state space for subsequent use by the '``cp.async.bulk.tensor``'
993+
instruction. The '``prefetchu.*``' instruction brings the cache line
987994
containing the specified generic address into the specified uniform cache level.
988995
If no address space is specified, it is assumed to be a generic address. The intrinsic
989996
uses an eviction priority which can be accessed by the '``.level::eviction_priority``' modifier.

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@
137137

138138
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
139139
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
140+
def llvm_constant_ptr_ty: LLVMQualPointerType<4>; // (const)ptr
140141
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
141142
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
142143
def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
@@ -2212,15 +2213,17 @@ def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d
22122213
// Intrinsics for Prefetch and Prefetchu
22132214
let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in {
22142215
foreach level = ["L1", "L2"] in {
2215-
def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>;
2216-
def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>;
2217-
def int_nvvm_prefetch_local_ # level : Intrinsic<[], [llvm_local_ptr_ty]>;
2216+
def int_nvvm_prefetch_ # level : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
2217+
def int_nvvm_prefetch_global_ # level : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
2218+
def int_nvvm_prefetch_local_ # level : DefaultAttrsIntrinsic<[], [llvm_local_ptr_ty]>;
22182219
}
22192220

2221+
def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>;
2222+
22202223
foreach eviction_priority = ["evict_normal", "evict_last"] in
2221-
def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>;
2224+
def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
22222225

2223-
def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>;
2226+
def int_nvvm_prefetchu_L1 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
22242227
}
22252228

22262229
// applypriority

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4046,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
40464046
return true;
40474047
}
40484048

4049+
case Intrinsic::nvvm_prefetch_tensormap: {
4050+
auto &DL = I.getDataLayout();
4051+
Info.opc = ISD::INTRINSIC_VOID;
4052+
Info.memVT = getPointerTy(DL);
4053+
Info.ptrVal = I.getArgOperand(0);
4054+
Info.offset = 0;
4055+
Info.flags =
4056+
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
4057+
Info.align.reset();
4058+
return true;
4059+
}
4060+
40494061
case Intrinsic::nvvm_ldu_global_i:
40504062
case Intrinsic::nvvm_ldu_global_f:
40514063
case Intrinsic::nvvm_ldu_global_p: {

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ def AS_match {
3939
code global = [{
4040
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
4141
}];
42+
code const = [{
43+
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST);
44+
}];
45+
code param = [{
46+
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM);
47+
}];
4248
}
4349

4450

@@ -950,33 +956,47 @@ foreach dim = 3...5 in {
950956
defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
951957
[hasTMACTAGroupSupport]>;
952958

953-
//Prefetch and Prefetchu
954-
955-
let Predicates = [hasPTX<80>, hasSM<90>] in {
956-
class PREFETCH_INTRS<string InstName> :
957-
BasicNVPTXInst<(outs), (ins ADDR:$addr),
958-
InstName,
959-
[(!cast<Intrinsic>(!strconcat("int_nvvm_",
960-
!subst(".", "_", InstName))) addr:$addr)]>;
959+
//Prefetchu and Prefetch
961960

962-
def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">;
963-
def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">;
964-
def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">;
965-
def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">;
966-
def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
967-
def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
961+
defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr);
968962

969-
def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
970-
"prefetch.global.L2::evict_normal",
971-
[(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>;
963+
multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> {
964+
def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>;
965+
}
972966

973-
def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr),
974-
"prefetch.global.L2::evict_last",
975-
[(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>;
967+
defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>;
968+
defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>;
969+
defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>;
976970

977-
def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
971+
multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> {
972+
def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
973+
"prefetch" # addrspace_name # ".tensormap",
974+
[(pattern_frag addr:$addr)]>,
975+
Requires<[hasPTX<80>, hasSM<90>]>;
978976
}
979977

978+
defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>;
979+
defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>;
980+
defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>;
981+
982+
class PREFETCH_INTRS<string InstName, Intrinsic Intr> :
983+
BasicNVPTXInst<(outs), (ins ADDR:$addr),
984+
InstName,
985+
[(Intr addr:$addr)]>,
986+
Requires<[hasPTX<80>, hasSM<90>]>;
987+
988+
def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>;
989+
def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>;
990+
def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>;
991+
def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>;
992+
def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>;
993+
def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>;
994+
def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>;
995+
def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal",
996+
int_nvvm_prefetch_global_L2_evict_normal>;
997+
def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last",
998+
int_nvvm_prefetch_global_L2_evict_last>;
999+
9801000
//Applypriority intrinsics
9811001
class APPLYPRIORITY_L2_INTRS<string addrspace> :
9821002
BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size),

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
564564
case Intrinsic::nvvm_isspacep_global:
565565
case Intrinsic::nvvm_isspacep_local:
566566
case Intrinsic::nvvm_isspacep_shared:
567-
case Intrinsic::nvvm_isspacep_shared_cluster: {
567+
case Intrinsic::nvvm_isspacep_shared_cluster:
568+
case Intrinsic::nvvm_prefetch_tensormap: {
568569
OpIndexes.push_back(0);
569570
return true;
570571
}
@@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
587588
return ConstantInt::get(II->getType(), *R);
588589
return nullptr;
589590
}
591+
case Intrinsic::nvvm_prefetch_tensormap: {
592+
IRBuilder<> Builder(II);
593+
return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
594+
NewV);
595+
}
590596
}
591597
return nullptr;
592598
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
3+
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify %}
4+
5+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
6+
target triple = "nvptx64-unknown-unknown"
7+
8+
@constant_tensormap = addrspace(4) global [64 x i8] zeroinitializer, align 64
9+
10+
; Inference from const address space
11+
define void @test_infer_const_from_cast() {
12+
; INFER-LABEL: @test_infer_const_from_cast
13+
; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
14+
; BOTH: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
15+
; PTX-LABEL: .visible .func test_infer_const_from_cast(
16+
; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
17+
; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
18+
; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
19+
entry:
20+
%casted = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
21+
call void @llvm.nvvm.prefetch.tensormap.p0(ptr %casted)
22+
ret void
23+
}
24+
25+
; Cast from Const space to Generic
26+
define void @test_const_to_generic_cast(ptr addrspace(4) %const_ptr) {
27+
; INFER-LABEL: @test_const_to_generic_cast
28+
; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
29+
; PTX-LABEL: .visible .func test_const_to_generic_cast(
30+
; PTX: prefetch.const.tensormap [%rd{{[0-9]+}}];
31+
entry:
32+
%cast = addrspacecast ptr addrspace(4) %const_ptr to ptr
33+
call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
34+
ret void
35+
}
36+
37+
; No inference possible
38+
define void @test_no_inference_possible(ptr %generic_ptr) {
39+
; INFER-LABEL: @test_no_inference_possible
40+
; INFER: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
41+
; PTX-LABEL: .visible .func test_no_inference_possible(
42+
; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
43+
entry:
44+
call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
45+
ret void
46+
}
47+
48+
; Cast from Parameter space to Generic
49+
define void @test_param_to_generic_cast(ptr addrspace(101) %param_ptr) {
50+
; INFER-LABEL: @test_param_to_generic_cast
51+
; INFER: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
52+
; PTX-LABEL: .visible .func test_param_to_generic_cast(
53+
; PTX: prefetch.param.tensormap [%rd{{[0-9]+}}];
54+
entry:
55+
%cast = addrspacecast ptr addrspace(101) %param_ptr to ptr
56+
call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
57+
ret void
58+
}
59+
60+
; Multiple casts in sequence
61+
define void @test_infer_through_multiple_casts() {
62+
; INFER-LABEL: @test_infer_through_multiple_casts
63+
; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
64+
; PTX-LABEL: .visible .func test_infer_through_multiple_casts(
65+
; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
66+
; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
67+
; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
68+
entry:
69+
%cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
70+
%cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
71+
%cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
72+
call void @llvm.nvvm.prefetch.tensormap(ptr %cast3)
73+
ret void
74+
}
75+
76+
declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
77+
declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))
78+
declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101))

llvm/test/CodeGen/NVPTX/prefetch.ll

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr)
1212
declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
1313
declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
1414

15+
declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
16+
declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
17+
declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
18+
1519
declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
1620
declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
1721

@@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) {
7882
; CHECK-PTX64-NEXT: ret;
7983
tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
8084
ret void
85+
}
86+
87+
define void @prefetch_tensormap(ptr %ptr) {
88+
; CHECK-PTX64-LABEL: prefetch_tensormap(
89+
; CHECK-PTX64: {
90+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
91+
; CHECK-PTX64-EMPTY:
92+
; CHECK-PTX64-NEXT: // %bb.0:
93+
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_tensormap_param_0];
94+
; CHECK-PTX64-NEXT: prefetch.tensormap [%rd1];
95+
; CHECK-PTX64-NEXT: ret;
96+
tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
97+
ret void
98+
}
99+
100+
define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) {
101+
; CHECK-PTX64-LABEL: prefetch_const_tensormap(
102+
; CHECK-PTX64: {
103+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
104+
; CHECK-PTX64-EMPTY:
105+
; CHECK-PTX64-NEXT: // %bb.0:
106+
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_const_tensormap_param_0];
107+
; CHECK-PTX64-NEXT: prefetch.const.tensormap [%rd1];
108+
; CHECK-PTX64-NEXT: ret;
109+
tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
110+
ret void
111+
}
112+
113+
define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
114+
; CHECK-PTX64-LABEL: prefetch_param_tensormap(
115+
; CHECK-PTX64: {
116+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
117+
; CHECK-PTX64-EMPTY:
118+
; CHECK-PTX64-NEXT: // %bb.0:
119+
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_param_tensormap_param_0];
120+
; CHECK-PTX64-NEXT: prefetch.param.tensormap [%rd1];
121+
; CHECK-PTX64-NEXT: ret;
122+
tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
123+
ret void
81124
}

0 commit comments

Comments
 (0)