[AMDGPU] AMDGPUPromoteAlloca: increase default max-regs to 32 #155076

perlfu · 2025-08-23T05:02:27Z

Increase promote-alloca-to-vector-max-regs to 32 from 16.
This restores default promotion of 16 x double which was disabled by #127973.

Fixes SWDEV-525817.

Increase promote-alloca-to-vector-max-regs to 32 from 16. This restores default promotion of 16 x double which was disabled by llvm#127973.

llvmbot · 2025-08-23T05:02:58Z

@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes

Increase promote-alloca-to-vector-max-regs to 32 from 16.
This restores default promotion of 16 x double which was disabled by #127973.

Fixes SWDEV-525817.

Patch is 27.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155076.diff

7 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+1-1)
(modified) llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (+6-6)
(modified) llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll (+7-6)
(modified) llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll (+3-3)
(modified) llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll (+128-116)
(modified) llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll (+1-1)
(modified) llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll (+1-1)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f226c7f381aa2..d988a89a506b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
     "amdgpu-promote-alloca-to-vector-max-regs",
     cl::desc(
         "Maximum vector size (in 32b registers) to use when promoting alloca"),
-    cl::init(16));
+    cl::init(32));
 
 // Use up to 1/4 of available register budget for vectorization.
 // FIXME: Increase the limit for whole function budgets? Perhaps x2?
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index f4b90b4293a46..4859e291b0613 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -441,8 +441,8 @@ entry:
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [2 x <16 x i32>], addrspace(5)
-  %tmp0 = getelementptr [2 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [3 x <16 x i32>], addrspace(5)
+  %tmp0 = getelementptr [3 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <16 x i32>, ptr addrspace(5) %tmp0
   store <16 x i32> %tmp5, ptr addrspace(1) %out
   ret void
@@ -485,8 +485,8 @@ define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [2 x <16 x float>], addrspace(5)
-  %tmp0 = getelementptr [2 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [3 x <16 x float>], addrspace(5)
+  %tmp0 = getelementptr [3 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <16 x float>, ptr addrspace(5) %tmp0
   store <16 x float> %tmp5, ptr addrspace(1) %out
   ret void
@@ -501,8 +501,8 @@ define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v2float_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [16 x <2 x float>], addrspace(5)
-  %tmp0 = getelementptr [16 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [17 x <2 x float>], addrspace(5)
+  %tmp0 = getelementptr [17 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <2 x float>, ptr addrspace(5) %tmp0
   store <2 x float> %tmp5, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index 3f499535400ef..322703df1fcd9 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -150,7 +150,7 @@ define amdgpu_cs void @with_spills() #0 {
   ret void
 }
 
-define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
 ; CHECK-LABEL: realign_stack:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
@@ -158,8 +158,9 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 ; CHECK-NEXT:    s_cmp_lg_u32 0, s33
 ; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
 ; CHECK-NEXT:    s_cmovk_i32 s33, 0x200
-; CHECK-NEXT:    s_movk_i32 s32, 0x100
-; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    s_movk_i32 s32, 0x180
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    scratch_store_b32 off, v32, s33 offset:128
 ; CHECK-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:112
 ; CHECK-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:96
 ; CHECK-NEXT:    scratch_store_b128 off, v[20:23], s33 offset:80
@@ -169,12 +170,12 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 ; CHECK-NEXT:    scratch_store_b128 off, v[4:7], s33 offset:16
 ; CHECK-NEXT:    scratch_store_b128 off, v[0:3], s33
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
-; CHECK-NEXT:    s_cmovk_i32 s32, 0x300
+; CHECK-NEXT:    s_cmovk_i32 s32, 0x380
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-NEXT:    s_alloc_vgpr 0
 ; CHECK-NEXT:    s_endpgm
-  %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  %v = alloca <33 x i32>, align 128, addrspace(5)
+  store <33 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
index cd428be729ae2..94ae31ccfb4ae 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -31,11 +31,11 @@ define amdgpu_cs void @with_calls() #0 {
   ret void
 }
 
-define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
 ; CHECK-LABEL: {{^}}name: realign_stack
 ; CHECK: scratchReservedForDynamicVGPRs: 512
-  %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  %v = alloca <33 x i32>, align 128, addrspace(5)
+  store <33 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
index ad42748ab3d60..c1123d7b515be 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
@@ -1,9 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --check-prefix=BASE --check-prefix=MAX16 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE %s --check-prefix=DEFAULT
 
 define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX16-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; MAX16-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; MAX16-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; MAX16-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX16-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
+; MAX24-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; MAX24-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
 ; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
 ; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -19,12 +51,50 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
 ; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
 ; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
-; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
 ; DEFAULT-NEXT:    ret void
 ;
-; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
-; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; MAX16-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; MAX16-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; MAX16-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; MAX16-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX16-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
 ; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
 ; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
@@ -36,18 +106,18 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; MAX24-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
 ; MAX24-NEXT:    ret void
 ;
-; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements(
-; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
-; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; MAX32-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
-; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
-; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
-; MAX32-NEXT:    ret void
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
 ;
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -67,18 +137,24 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
   ret void
 }
 
-define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
-; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
-; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
+; BASE-LABEL: define amdgpu_kernel void @i32_32_elements(
+; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; BASE-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; BASE-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
 ; BASE-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
 ; BASE-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
 ; BASE-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
 ; BASE-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; BASE-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
-; BASE-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
-; BASE-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; BASE-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; BASE-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; BASE-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; BASE-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; BASE-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; BASE-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; BASE-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; BASE-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; BASE-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; BASE-NEXT:    ret void
 ;
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -87,40 +163,40 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
   %c2 = icmp uge i32 %y, 3
   %sel1 = select i1 %c1, i32 1, i32 2
   %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %alloca = alloca [24 x i32], align 16, addrspace(5)
-  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
-  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
-  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  %alloca = alloca [32 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
+  %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
   store i32 42, ptr addrspace(5) %gep.0
   store i32 43, ptr addrspace(5) %gep.1
-  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
   %load = load i32, ptr addrspace(5) %gep
   store i32 %load, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
-; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements(
-; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
-; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
-; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
-; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
-; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
-; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
-; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
-; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
-; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
-; DEFAULT-NEXT:    ret void
+define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; MAX16-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; MAX16-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; MAX16-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; MAX16-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT:    store i32 [[TMP1]]...
[truncated]

arsenm

Generally I think using volatile is the best way to defeat the optimization for the tests intended to use the stack, but it's not that important

arsenm · 2025-08-23T11:21:49Z

llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll

 define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [2 x <16 x i32>], addrspace(5)
-  %tmp0 = getelementptr [2 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [3 x <16 x i32>], addrspace(5)


These cases probably should have been defeated through volatile or adding the flag but I guess it doesn't really matter

arsenm · 2025-08-23T11:22:03Z

llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll

-  %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  %v = alloca <33 x i32>, align 128, addrspace(5)
+  store <33 x i32> %x, ptr addrspace(5) %v


llvm-ci · 2025-08-26T01:35:59Z

LLVM Buildbot has detected a new failure on builder clang-aarch64-sve-vls running on linaro-g3-04 while building llvm at step 7 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/143/builds/10298

Here is the relevant piece of the build log for the reference

Step 7 (ninja check 1) failure: stage 1 checked (failure)
...
[436/792] Building CXX object unittests/Object/CMakeFiles/ObjectTests.dir/ObjectFileTest.cpp.o
[437/792] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Serialization/VarDeclConstantInitTest.cpp.o
[438/792] Building CXX object unittests/Remarks/CMakeFiles/RemarksTests.dir/RemarksLinkingTest.cpp.o
[439/792] Building CXX object unittests/Object/CMakeFiles/ObjectTests.dir/SymbolicFileTest.cpp.o
[440/792] Building CXX object unittests/ObjectYAML/CMakeFiles/ObjectYAMLTests.dir/YAML2ObjTest.cpp.o
[441/792] Building CXX object unittests/ProfileData/CMakeFiles/ProfileDataTests.dir/SymbolRemappingReaderTest.cpp.o
[442/792] Building CXX object unittests/Object/CMakeFiles/ObjectTests.dir/OffloadingTest.cpp.o
[443/792] Building CXX object unittests/Remarks/CMakeFiles/RemarksTests.dir/YAMLRemarksSerializerTest.cpp.o
[443/792] cd /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/runtimes/runtimes-bins && /usr/local/bin/cmake --build /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/runtimes/runtimes-bins/ --target runtimes-test-depends --config Release
[1/1] Linking CXX executable flang-rt/unittests/Runtime/RuntimeTests
FAILED: flang-rt/unittests/Runtime/RuntimeTests 
: && /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/./bin/clang++ --target=aarch64-unknown-linux-gnu -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -O3 -DNDEBUG -fuse-ld=lld -Wl,--color-diagnostics    -Wl,--gc-sections flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/AccessTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Assign.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/BufferTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CharacterTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CommandTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Complex.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CrashHandlerFixture.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Derived.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Descriptor.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ExternalIOTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Format.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/InputExtensions.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Inquiry.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ListInputTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/LogicalFormatTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Matmul.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/MatmulTranspose.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/MiscIntrinsic.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Namelist.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Numeric.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/NumericalFormatTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Pointer.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Ragged.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Random.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Reduction.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/RuntimeCrashTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Stop.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Support.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Time.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/TemporaryStack.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Transformational.cpp.o -o flang-rt/unittests/Runtime/RuntimeTests  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libLLVMSupport.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libllvm_gtest_main.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libllvm_gtest.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/clang/22/lib/aarch64-unknown-linux-gnu/libflang_rt.runtime.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libLLVMSupport.a  -lrt  -ldl  -lm  /usr/lib/aarch64-linux-gnu/libz.so  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libLLVMDemangle.a  -lpthread && :
ld.lld: error: undefined symbol: testing::internal::GetBoolAssertionFailureMessage[abi:cxx11](testing::AssertionResult const&, char const*, char const*, char const*)
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(AllocatableTest_MoveAlloc_Test::TestBody())
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(AllocatableTest_MoveAlloc_Test::TestBody())
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(AllocatableTest_MoveAlloc_Test::TestBody())
>>> referenced 338 more times

ld.lld: error: undefined symbol: llvm::Twine::str[abi:cxx11]() const
>>> referenced by AccessTest.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/AccessTest.cpp.o:(createTemporaryFile[abi:cxx11](char const*, (anonymous namespace)::AccessType const&))

ld.lld: error: undefined symbol: testing::internal::EqFailure(char const*, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, bool)
>>> referenced by BufferTest.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/BufferTest.cpp.o:(BufferTests_TestFrameBufferReadAndWrite_Test::TestBody())
>>> referenced by Assign.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Assign.cpp.o:(testing::AssertionResult testing::internal::CmpHelperEQ<Fortran::runtime::TypeCode, Fortran::runtime::TypeCode>(char const*, char const*, Fortran::runtime::TypeCode const&, Fortran::runtime::TypeCode const&))
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(testing::AssertionResult testing::internal::CmpHelperEQ<std::basic_string_view<char, std::char_traits<char>>, char [50]>(char const*, char const*, std::basic_string_view<char, std::char_traits<char>> const&, char const (&) [50]))
>>> referenced 46 more times

ld.lld: error: undefined symbol: testing::internal::PrintStringTo(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, std::ostream*)
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> testing::PrintToString<std::basic_string_view<char, std::char_traits<char>>>(std::basic_string_view<char, std::char_traits<char>> const&))
>>> referenced by ArrayConstructor.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o:(testing::internal::MatchesRegexMatcher::DescribeTo(std::ostream*) const)
>>> referenced by ArrayConstructor.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o:(testing::internal::MatchesRegexMatcher::DescribeNegationTo(std::ostream*) const)
>>> referenced 4 more times

ld.lld: error: undefined symbol: testing::internal::DeathTest::Create(char const*, testing::Matcher<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&>, char const*, int, testing::internal::DeathTest**)
>>> referenced by ArrayConstructor.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o:(ArrayConstructor_CharacterRuntimeCheck_Test::TestBody())
>>> referenced by CommandTest.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CommandTest.cpp.o:(ZeroArguments_ECLInvalidCommandTerminatedSync_Test::TestBody())
>>> referenced by ListInputTest.cpp

…55076) Increase promote-alloca-to-vector-max-regs to 32 from 16. This restores default promotion of 16 x double which was disabled by Fixes SWDEV-525817.

…55076) (llvm#3746)

[AMDGPU] AMDGPUPromoteAlloca: increase max-regs to 32

4ce05f4

Increase promote-alloca-to-vector-max-regs to 32 from 16. This restores default promotion of 16 x double which was disabled by llvm#127973.

perlfu requested review from arsenm, jayfoad and rovka August 23, 2025 05:02

llvmbot added the backend:AMDGPU label Aug 23, 2025

perlfu requested a review from rampitec August 23, 2025 05:07

arsenm approved these changes Aug 23, 2025

View reviewed changes

perlfu added 2 commits August 25, 2025 12:17

- Use volatile accesses

01b580d

- R600 specific limit, removing R600 related test changes

66181d2

rampitec approved these changes Aug 25, 2025

View reviewed changes

perlfu merged commit 1f6648c into llvm:main Aug 26, 2025
10 of 12 checks passed

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Oct 9, 2025

[AMDGPU] AMDGPUPromoteAlloca: increase default max-regs to 32 (llvm#1…

96f63b4

…55076) (llvm#3746)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] AMDGPUPromoteAlloca: increase default max-regs to 32 #155076

[AMDGPU] AMDGPUPromoteAlloca: increase default max-regs to 32 #155076

Uh oh!

perlfu commented Aug 23, 2025

Uh oh!

llvmbot commented Aug 23, 2025

Uh oh!

arsenm left a comment

Uh oh!

arsenm Aug 23, 2025

Uh oh!

arsenm Aug 23, 2025

Uh oh!

Uh oh!

llvm-ci commented Aug 26, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

[AMDGPU] AMDGPUPromoteAlloca: increase default max-regs to 32 #155076

[AMDGPU] AMDGPUPromoteAlloca: increase default max-regs to 32 #155076

Uh oh!

Conversation

perlfu commented Aug 23, 2025

Uh oh!

llvmbot commented Aug 23, 2025

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

arsenm Aug 23, 2025

Choose a reason for hiding this comment

Uh oh!

arsenm Aug 23, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Aug 26, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants