Skip to content

Commit 4ce05f4

Browse files
committed
[AMDGPU] AMDGPUPromoteAlloca: increase max-regs to 32
Increase the default of amdgpu-promote-alloca-to-vector-max-regs from 16 to 32. This restores default promotion of 16 x double, which was disabled by llvm#127973.
1 parent 9dc66a6 commit 4ce05f4

File tree

7 files changed

+147
-134
lines changed

7 files changed

+147
-134
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
7070
"amdgpu-promote-alloca-to-vector-max-regs",
7171
cl::desc(
7272
"Maximum vector size (in 32b registers) to use when promoting alloca"),
73-
cl::init(16));
73+
cl::init(32));
7474

7575
// Use up to 1/4 of available register budget for vectorization.
7676
// FIXME: Increase the limit for whole function budgets? Perhaps x2?

llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -441,8 +441,8 @@ entry:
441441
; SI: buffer_load_dword
442442

443443
define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
444-
%alloca = alloca [2 x <16 x i32>], addrspace(5)
445-
%tmp0 = getelementptr [2 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
444+
%alloca = alloca [3 x <16 x i32>], addrspace(5)
445+
%tmp0 = getelementptr [3 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
446446
%tmp5 = load <16 x i32>, ptr addrspace(5) %tmp0
447447
store <16 x i32> %tmp5, ptr addrspace(1) %out
448448
ret void
@@ -485,8 +485,8 @@ define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
485485
; SI: buffer_load_dword
486486

487487
define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
488-
%alloca = alloca [2 x <16 x float>], addrspace(5)
489-
%tmp0 = getelementptr [2 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
488+
%alloca = alloca [3 x <16 x float>], addrspace(5)
489+
%tmp0 = getelementptr [3 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
490490
%tmp5 = load <16 x float>, ptr addrspace(5) %tmp0
491491
store <16 x float> %tmp5, ptr addrspace(1) %out
492492
ret void
@@ -501,8 +501,8 @@ define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
501501
; SI: buffer_load_dword
502502

503503
define amdgpu_kernel void @v2float_stack(ptr addrspace(1) %out, i32 %a) {
504-
%alloca = alloca [16 x <2 x float>], addrspace(5)
505-
%tmp0 = getelementptr [16 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
504+
%alloca = alloca [17 x <2 x float>], addrspace(5)
505+
%tmp0 = getelementptr [17 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
506506
%tmp5 = load <2 x float>, ptr addrspace(5) %tmp0
507507
store <2 x float> %tmp5, ptr addrspace(1) %out
508508
ret void

llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -150,16 +150,17 @@ define amdgpu_cs void @with_spills() #0 {
150150
ret void
151151
}
152152

153-
define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
153+
define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
154154
; CHECK-LABEL: realign_stack:
155155
; CHECK: ; %bb.0:
156156
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
157157
; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
158158
; CHECK-NEXT: s_cmp_lg_u32 0, s33
159159
; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
160160
; CHECK-NEXT: s_cmovk_i32 s33, 0x200
161-
; CHECK-NEXT: s_movk_i32 s32, 0x100
162-
; CHECK-NEXT: s_clause 0x7
161+
; CHECK-NEXT: s_movk_i32 s32, 0x180
162+
; CHECK-NEXT: s_clause 0x8
163+
; CHECK-NEXT: scratch_store_b32 off, v32, s33 offset:128
163164
; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
164165
; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96
165166
; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
@@ -169,12 +170,12 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
169170
; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16
170171
; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33
171172
; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
172-
; CHECK-NEXT: s_cmovk_i32 s32, 0x300
173+
; CHECK-NEXT: s_cmovk_i32 s32, 0x380
173174
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
174175
; CHECK-NEXT: s_alloc_vgpr 0
175176
; CHECK-NEXT: s_endpgm
176-
%v = alloca <32 x i32>, align 128, addrspace(5)
177-
store <32 x i32> %x, ptr addrspace(5) %v
177+
%v = alloca <33 x i32>, align 128, addrspace(5)
178+
store <33 x i32> %x, ptr addrspace(5) %v
178179
call amdgpu_gfx void @callee(i32 71)
179180
ret void
180181
}

llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,11 +31,11 @@ define amdgpu_cs void @with_calls() #0 {
3131
ret void
3232
}
3333

34-
define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
34+
define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
3535
; CHECK-LABEL: {{^}}name: realign_stack
3636
; CHECK: scratchReservedForDynamicVGPRs: 512
37-
%v = alloca <32 x i32>, align 128, addrspace(5)
38-
store <32 x i32> %x, ptr addrspace(5) %v
37+
%v = alloca <33 x i32>, align 128, addrspace(5)
38+
store <33 x i32> %x, ptr addrspace(5) %v
3939
call amdgpu_gfx void @callee(i32 71)
4040
ret void
4141
}

0 commit comments

Comments
 (0)