llvm · perlfu · Aug 26, 2025 · Aug 23, 2025 · Aug 25, 2025 · Aug 25, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
     "amdgpu-promote-alloca-to-vector-max-regs",
     cl::desc(
         "Maximum vector size (in 32b registers) to use when promoting alloca"),
-    cl::init(16));
+    cl::init(32));
 
 // Use up to 1/4 of available register budget for vectorization.
 // FIXME: Increase the limit for whole function budgets? Perhaps x2?
@@ -287,8 +287,12 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
 
 void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
   // Load per function limits, overriding with global options where appropriate.
+  // R600 register tuples/aliasing are fragile with large vector promotions so
+  // apply architecture specific limit here.
+  const int R600MaxVectorRegs = 16;
   MaxVectorRegs = F.getFnAttributeAsParsedInteger(
-      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+      "amdgpu-promote-alloca-to-vector-max-regs",
+      IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
   if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
     MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
   VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(

diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -154,26 +154,31 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 ; CHECK-LABEL: realign_stack:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT:    v_mov_b32_e32 v32, 0
 ; CHECK-NEXT:    s_cmp_lg_u32 0, s33
-; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
 ; CHECK-NEXT:    s_cmovk_i32 s33, 0x200
-; CHECK-NEXT:    s_movk_i32 s32, 0x100
+; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT:    scratch_store_b32 off, v32, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:112
 ; CHECK-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:96
-; CHECK-NEXT:    scratch_store_b128 off, v[20:23], s33 offset:80
+; CHECK-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:112
 ; CHECK-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:64
-; CHECK-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:48
+; CHECK-NEXT:    scratch_store_b128 off, v[20:23], s33 offset:80
 ; CHECK-NEXT:    scratch_store_b128 off, v[8:11], s33 offset:32
+; CHECK-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:48
 ; CHECK-NEXT:    scratch_store_b128 off, v[4:7], s33 offset:16
 ; CHECK-NEXT:    scratch_store_b128 off, v[0:3], s33
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT:    s_movk_i32 s32, 0x100
 ; CHECK-NEXT:    s_cmovk_i32 s32, 0x300
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-NEXT:    s_alloc_vgpr 0
 ; CHECK-NEXT:    s_endpgm
   %v = alloca <32 x i32>, align 128, addrspace(5)
+  ; use volatile store to avoid promotion of alloca to registers
+  store volatile i32 0, ptr addrspace(5) %v
   store <32 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -35,7 +35,8 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 ; CHECK-LABEL: {{^}}name: realign_stack
 ; CHECK: scratchReservedForDynamicVGPRs: 512
   %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  ; use volatile store to avoid promotion of alloca to registers
+  store volatile <32 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void
 }