Skip to content

Commit 1f6648c

Browse files
authored
[AMDGPU] AMDGPUPromoteAlloca: increase default max-regs to 32 (#155076)
Increase the default value of amdgpu-promote-alloca-to-vector-max-regs from 16 to 32 on AMDGCN targets (R600 keeps a default of 16). This restores the default promotion of 16 x double, which was disabled by #127973. Fixes SWDEV-525817.
1 parent 3c91d58 commit 1f6648c

File tree

6 files changed

+149
-127
lines changed

6 files changed

+149
-127
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
7070
"amdgpu-promote-alloca-to-vector-max-regs",
7171
cl::desc(
7272
"Maximum vector size (in 32b registers) to use when promoting alloca"),
73-
cl::init(16));
73+
cl::init(32));
7474

7575
// Use up to 1/4 of available register budget for vectorization.
7676
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
@@ -287,8 +287,12 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
287287

288288
void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
289289
// Load per function limits, overriding with global options where appropriate.
290+
// R600 register tuples/aliasing are fragile with large vector promotions so
291+
// apply architecture specific limit here.
292+
const int R600MaxVectorRegs = 16;
290293
MaxVectorRegs = F.getFnAttributeAsParsedInteger(
291-
"amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
294+
"amdgpu-promote-alloca-to-vector-max-regs",
295+
IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
292296
if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
293297
MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
294298
VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(

llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,26 +154,31 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
154154
; CHECK-LABEL: realign_stack:
155155
; CHECK: ; %bb.0:
156156
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
157-
; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
157+
; CHECK-NEXT: v_mov_b32_e32 v32, 0
158158
; CHECK-NEXT: s_cmp_lg_u32 0, s33
159-
; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
159+
; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
160160
; CHECK-NEXT: s_cmovk_i32 s33, 0x200
161-
; CHECK-NEXT: s_movk_i32 s32, 0x100
161+
; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
162+
; CHECK-NEXT: scratch_store_b32 off, v32, s33 scope:SCOPE_SYS
163+
; CHECK-NEXT: s_wait_storecnt 0x0
162164
; CHECK-NEXT: s_clause 0x7
163-
; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
164165
; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96
165-
; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
166+
; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
166167
; CHECK-NEXT: scratch_store_b128 off, v[16:19], s33 offset:64
167-
; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48
168+
; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
168169
; CHECK-NEXT: scratch_store_b128 off, v[8:11], s33 offset:32
170+
; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48
169171
; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16
170172
; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33
171173
; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
174+
; CHECK-NEXT: s_movk_i32 s32, 0x100
172175
; CHECK-NEXT: s_cmovk_i32 s32, 0x300
173176
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
174177
; CHECK-NEXT: s_alloc_vgpr 0
175178
; CHECK-NEXT: s_endpgm
176179
%v = alloca <32 x i32>, align 128, addrspace(5)
180+
; use volatile store to avoid promotion of alloca to registers
181+
store volatile i32 0, ptr addrspace(5) %v
177182
store <32 x i32> %x, ptr addrspace(5) %v
178183
call amdgpu_gfx void @callee(i32 71)
179184
ret void

llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
3535
; CHECK-LABEL: {{^}}name: realign_stack
3636
; CHECK: scratchReservedForDynamicVGPRs: 512
3737
%v = alloca <32 x i32>, align 128, addrspace(5)
38-
store <32 x i32> %x, ptr addrspace(5) %v
38+
; use volatile store to avoid promotion of alloca to registers
39+
store volatile <32 x i32> %x, ptr addrspace(5) %v
3940
call amdgpu_gfx void @callee(i32 71)
4041
ret void
4142
}

0 commit comments

Comments
 (0)