llvm · perlfu · Aug 26, 2025 · Aug 23, 2025 · Aug 25, 2025 · Aug 25, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
     "amdgpu-promote-alloca-to-vector-max-regs",
     cl::desc(
         "Maximum vector size (in 32b registers) to use when promoting alloca"),
-    cl::init(16));
+    cl::init(32));
 
 // Use up to 1/4 of available register budget for vectorization.
 // FIXME: Increase the limit for whole function budgets? Perhaps x2?

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -441,8 +441,8 @@ entry:
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [2 x <16 x i32>], addrspace(5)
-  %tmp0 = getelementptr [2 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [3 x <16 x i32>], addrspace(5)
+  %tmp0 = getelementptr [3 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <16 x i32>, ptr addrspace(5) %tmp0
   store <16 x i32> %tmp5, ptr addrspace(1) %out
   ret void
@@ -485,8 +485,8 @@ define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [2 x <16 x float>], addrspace(5)
-  %tmp0 = getelementptr [2 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [3 x <16 x float>], addrspace(5)
+  %tmp0 = getelementptr [3 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <16 x float>, ptr addrspace(5) %tmp0
   store <16 x float> %tmp5, ptr addrspace(1) %out
   ret void
@@ -501,8 +501,8 @@ define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v2float_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [16 x <2 x float>], addrspace(5)
-  %tmp0 = getelementptr [16 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [17 x <2 x float>], addrspace(5)
+  %tmp0 = getelementptr [17 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <2 x float>, ptr addrspace(5) %tmp0
   store <2 x float> %tmp5, ptr addrspace(1) %out
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -150,16 +150,17 @@ define amdgpu_cs void @with_spills() #0 {
   ret void
 }
 
-define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
 ; CHECK-LABEL: realign_stack:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
 ; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
 ; CHECK-NEXT:    s_cmp_lg_u32 0, s33
 ; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
 ; CHECK-NEXT:    s_cmovk_i32 s33, 0x200
-; CHECK-NEXT:    s_movk_i32 s32, 0x100
-; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    s_movk_i32 s32, 0x180
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    scratch_store_b32 off, v32, s33 offset:128
 ; CHECK-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:112
 ; CHECK-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:96
 ; CHECK-NEXT:    scratch_store_b128 off, v[20:23], s33 offset:80
@@ -169,12 +170,12 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 ; CHECK-NEXT:    scratch_store_b128 off, v[4:7], s33 offset:16
 ; CHECK-NEXT:    scratch_store_b128 off, v[0:3], s33
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
-; CHECK-NEXT:    s_cmovk_i32 s32, 0x300
+; CHECK-NEXT:    s_cmovk_i32 s32, 0x380
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-NEXT:    s_alloc_vgpr 0
 ; CHECK-NEXT:    s_endpgm
-  %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  %v = alloca <33 x i32>, align 128, addrspace(5)
+  store <33 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -31,11 +31,11 @@ define amdgpu_cs void @with_calls() #0 {
   ret void
 }
 
-define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
 ; CHECK-LABEL: {{^}}name: realign_stack
 ; CHECK: scratchReservedForDynamicVGPRs: 512
-  %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  %v = alloca <33 x i32>, align 128, addrspace(5)
+  store <33 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void
 }