Skip to content

Commit 420a5de

Browse files
authored
[AMDGPU] Ignore inactive VGPRs in .vgpr_count (llvm#149052)
When using the `amdgcn.init.whole.wave` intrinsic, we add dummy VGPR arguments with the purpose of preserving their inactive lanes. The pattern may look something like this:

```
entry:
  call amdgcn.init.whole.wave
  branch to shader or tail
shader:
  $vInactive = IMPLICIT_DEF ; Tells regalloc it's safe to use the active lanes
  actual code...
tail:
  call amdgcn.cs.chain [...], implicit $vInactive
```

We should not report these VGPRs in the `.vgpr_count` metadata. This patch achieves that goal by ignoring meta instructions and calls. This should be safe since if those registers are actually used in any other context, they will be counted there. The same reasoning applies in the general case, so we don't explicitly check for the existence of `init.whole.wave`.

This is a reworked version of llvm#133242, which was reverted in llvm#144039 and split into smaller bits.
1 parent bf6796f commit 420a5de

10 files changed

+349
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,9 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
241241
if (!RC || !TRI.isVGPRClass(RC))
242242
continue;
243243

244+
if (MI.isCall() || MI.isMetaInstruction())
245+
continue;
246+
244247
unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
245248
unsigned HWReg = TRI.getHWRegIndex(Reg);
246249
int MaxUsed = HWReg + Width - 1;
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
2+
3+
; CHECK-LABEL: .shader_functions:
4+
5+
; Use VGPRs above the input arguments.
6+
; CHECK-LABEL: _miss_1:
7+
; CHECK: .vgpr_count: 0x1d{{$}}
8+
9+
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
10+
i32 %vcr, { i32 } %system.data,
11+
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
12+
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
13+
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
14+
local_unnamed_addr {
15+
entry:
16+
%system.data.value = extractvalue { i32 } %system.data, 0
17+
%dead.val = call i32 @llvm.amdgcn.dead.i32()
18+
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
19+
br i1 %is.whole.wave, label %shader, label %tail
20+
21+
shader:
22+
%system.data.extract = extractvalue { i32 } %system.data, 0
23+
%data.mul = mul i32 %system.data.extract, 2
24+
%data.add = add i32 %data.mul, 1
25+
call void asm sideeffect "; clobber v28", "~{v28}"()
26+
br label %tail
27+
28+
tail:
29+
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
30+
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
31+
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
32+
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
33+
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
34+
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
35+
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
36+
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
37+
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
38+
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
39+
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
40+
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
41+
42+
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
43+
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
44+
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
45+
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
46+
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
47+
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
48+
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
49+
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
50+
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
51+
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
52+
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
53+
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
54+
55+
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
56+
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
57+
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
58+
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
59+
60+
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
61+
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
62+
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
63+
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
64+
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
65+
unreachable
66+
}
67+
68+
declare i32 @llvm.amdgcn.dead.i32()
69+
declare i1 @llvm.amdgcn.init.whole.wave()
70+
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
71+
72+
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
2+
3+
; CHECK-LABEL: .shader_functions:
4+
5+
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
6+
; CHECK-LABEL: leaf_shader:
7+
; CHECK: .vgpr_count: 0xc{{$}}
8+
9+
; Function without calls.
10+
define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
11+
i32 %active.vgpr1, i32 %active.vgpr2,
12+
i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
13+
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
14+
local_unnamed_addr {
15+
entry:
16+
%dead.val = call i32 @llvm.amdgcn.dead.i32()
17+
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
18+
br i1 %is.whole.wave, label %compute, label %merge
19+
20+
compute:
21+
; Perform a more complex computation using active VGPRs
22+
%square = mul i32 %active.vgpr1, %active.vgpr1
23+
%product = mul i32 %square, %active.vgpr2
24+
%sum = add i32 %product, %input.value
25+
%result = add i32 %sum, 42
26+
br label %merge
27+
28+
merge:
29+
%final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
30+
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
31+
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
32+
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
33+
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
34+
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
35+
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
36+
37+
store i32 %final.result, ptr %output.ptr, align 4
38+
39+
ret void
40+
}
41+
42+
declare i32 @llvm.amdgcn.dead.i32()
43+
declare i1 @llvm.amdgcn.init.whole.wave()
44+
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
45+
46+
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
2+
3+
; CHECK-LABEL: .shader_functions:
4+
5+
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
6+
; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
7+
; In that case, the VGPR should be included in the .vgpr_count.
8+
; CHECK-LABEL: _miss_1:
9+
; CHECK: .vgpr_count: 0xd{{$}}
10+
11+
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
12+
i32 %vcr, { i32 } %system.data,
13+
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
14+
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
15+
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
16+
local_unnamed_addr {
17+
entry:
18+
%system.data.value = extractvalue { i32 } %system.data, 0
19+
%dead.val = call i32 @llvm.amdgcn.dead.i32()
20+
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
21+
br i1 %is.whole.wave, label %shader, label %tail
22+
23+
shader:
24+
%system.data.extract = extractvalue { i32 } %system.data, 0
25+
%data.mul = mul i32 %system.data.extract, 2
26+
%data.add = add i32 %data.mul, 1
27+
call void asm sideeffect "; clobber VGPR for %inactive.vgpr2", "~{v12}"()
28+
br label %tail
29+
30+
tail:
31+
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
32+
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
33+
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
34+
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
35+
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
36+
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
37+
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
38+
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
39+
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
40+
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
41+
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
42+
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
43+
44+
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
45+
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
46+
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
47+
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
48+
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
49+
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
50+
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
51+
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
52+
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
53+
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
54+
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
55+
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
56+
57+
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
58+
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
59+
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
60+
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
61+
62+
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
63+
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
64+
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
65+
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
66+
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
67+
unreachable
68+
}
69+
70+
declare i32 @llvm.amdgcn.dead.i32()
71+
declare i1 @llvm.amdgcn.init.whole.wave()
72+
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
73+
74+
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
2+
3+
; CHECK-LABEL: .shader_functions:
4+
5+
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
6+
; CHECK-LABEL: _miss_1:
7+
; CHECK: .vgpr_count: 0xa{{$}}
8+
9+
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
10+
i32 %vcr, { i32 } %system.data,
11+
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
12+
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
13+
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
14+
local_unnamed_addr {
15+
entry:
16+
%system.data.value = extractvalue { i32 } %system.data, 0
17+
%dead.val = call i32 @llvm.amdgcn.dead.i32()
18+
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
19+
br i1 %is.whole.wave, label %shader, label %tail
20+
21+
shader:
22+
%system.data.extract = extractvalue { i32 } %system.data, 0
23+
%data.mul = mul i32 %system.data.extract, 2
24+
%data.add = add i32 %data.mul, 1
25+
br label %tail
26+
27+
tail:
28+
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
29+
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
30+
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
31+
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
32+
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
33+
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
34+
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
35+
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
36+
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
37+
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
38+
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
39+
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
40+
41+
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
42+
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
43+
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
44+
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
45+
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
46+
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
47+
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
48+
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
49+
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
50+
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
51+
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
52+
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
53+
54+
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
55+
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
56+
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
57+
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
58+
59+
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
60+
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
61+
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
62+
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
63+
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
64+
unreachable
65+
}
66+
67+
declare i32 @llvm.amdgcn.dead.i32()
68+
declare i1 @llvm.amdgcn.init.whole.wave()
69+
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
70+
71+
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ entry:
1616
}
1717

1818
; CHECK-LABEL: __unnamed_2:
19-
; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
19+
; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
2020
; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
2121
; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
2222
; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)

llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
12641264
}
12651265

12661266
; GCN-LABEL: {{^}}f1024_0:
1267-
; GFX90A: NumVgprs: 32
1267+
; GFX90A: NumVgprs: 1
12681268
; GFX90A: NumAgprs: 1
1269-
; GFX90A: TotalNumVgprs: 33
1269+
; GFX90A: TotalNumVgprs: 5
12701270
define void @f1024_0() #1024 {
12711271
call void @foo()
12721272
ret void
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
; RUN: llc -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=CHECK,PACKED
2+
; RUN: llc -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
3+
target triple = "amdgcn-amd-amdhsa"
4+
5+
@global = addrspace(1) global i32 poison, align 4
6+
7+
; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
8+
; Only hardware-initialized VGPRs (v0) are read in this kernel.
9+
10+
; CHECK-LABEL: amdhsa.kernels:
11+
; CHECK-LABEL: kernel_x
12+
; CHECK: .vgpr_count: 1
13+
define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
14+
entry:
15+
%id = call i32 @llvm.amdgcn.workitem.id.x()
16+
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
17+
ret void
18+
}
19+
20+
; CHECK-LABEL: kernel_z
21+
; PACKED: .vgpr_count: 1
22+
; NOTPACKED: .vgpr_count: 3
23+
define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
24+
entry:
25+
%id = call i32 @llvm.amdgcn.workitem.id.z()
26+
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
27+
ret void
28+
}
29+
30+
attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

0 commit comments

Comments
 (0)