Skip to content

Conversation

@perlfu
Copy link
Contributor

@perlfu perlfu commented Aug 23, 2025

Increase promote-alloca-to-vector-max-regs to 32 from 16.
This restores default promotion of 16 x double which was disabled by #127973.

Fixes SWDEV-525817.

Increase promote-alloca-to-vector-max-regs to 32 from 16.
This restores default promotion of 16 x double which was disabled
by llvm#127973.
@llvmbot
Copy link
Member

llvmbot commented Aug 23, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes

Increase promote-alloca-to-vector-max-regs to 32 from 16.
This restores default promotion of 16 x double which was disabled by #127973.

Fixes SWDEV-525817.


Patch is 27.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155076.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll (+7-6)
  • (modified) llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll (+128-116)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll (+1-1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f226c7f381aa2..d988a89a506b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
     "amdgpu-promote-alloca-to-vector-max-regs",
     cl::desc(
         "Maximum vector size (in 32b registers) to use when promoting alloca"),
-    cl::init(16));
+    cl::init(32));
 
 // Use up to 1/4 of available register budget for vectorization.
 // FIXME: Increase the limit for whole function budgets? Perhaps x2?
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index f4b90b4293a46..4859e291b0613 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -441,8 +441,8 @@ entry:
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [2 x <16 x i32>], addrspace(5)
-  %tmp0 = getelementptr [2 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [3 x <16 x i32>], addrspace(5)
+  %tmp0 = getelementptr [3 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <16 x i32>, ptr addrspace(5) %tmp0
   store <16 x i32> %tmp5, ptr addrspace(1) %out
   ret void
@@ -485,8 +485,8 @@ define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [2 x <16 x float>], addrspace(5)
-  %tmp0 = getelementptr [2 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [3 x <16 x float>], addrspace(5)
+  %tmp0 = getelementptr [3 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <16 x float>, ptr addrspace(5) %tmp0
   store <16 x float> %tmp5, ptr addrspace(1) %out
   ret void
@@ -501,8 +501,8 @@ define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) {
 ; SI: buffer_load_dword
 
 define amdgpu_kernel void @v2float_stack(ptr addrspace(1) %out, i32 %a) {
-  %alloca = alloca [16 x <2 x float>], addrspace(5)
-  %tmp0 = getelementptr [16 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %alloca = alloca [17 x <2 x float>], addrspace(5)
+  %tmp0 = getelementptr [17 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a
   %tmp5 = load <2 x float>, ptr addrspace(5) %tmp0
   store <2 x float> %tmp5, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index 3f499535400ef..322703df1fcd9 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -150,7 +150,7 @@ define amdgpu_cs void @with_spills() #0 {
   ret void
 }
 
-define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
 ; CHECK-LABEL: realign_stack:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
@@ -158,8 +158,9 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 ; CHECK-NEXT:    s_cmp_lg_u32 0, s33
 ; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
 ; CHECK-NEXT:    s_cmovk_i32 s33, 0x200
-; CHECK-NEXT:    s_movk_i32 s32, 0x100
-; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    s_movk_i32 s32, 0x180
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    scratch_store_b32 off, v32, s33 offset:128
 ; CHECK-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:112
 ; CHECK-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:96
 ; CHECK-NEXT:    scratch_store_b128 off, v[20:23], s33 offset:80
@@ -169,12 +170,12 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 ; CHECK-NEXT:    scratch_store_b128 off, v[4:7], s33 offset:16
 ; CHECK-NEXT:    scratch_store_b128 off, v[0:3], s33
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
-; CHECK-NEXT:    s_cmovk_i32 s32, 0x300
+; CHECK-NEXT:    s_cmovk_i32 s32, 0x380
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-NEXT:    s_alloc_vgpr 0
 ; CHECK-NEXT:    s_endpgm
-  %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  %v = alloca <33 x i32>, align 128, addrspace(5)
+  store <33 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
index cd428be729ae2..94ae31ccfb4ae 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -31,11 +31,11 @@ define amdgpu_cs void @with_calls() #0 {
   ret void
 }
 
-define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+define amdgpu_cs void @realign_stack(<33 x i32> %x) #0 {
 ; CHECK-LABEL: {{^}}name: realign_stack
 ; CHECK: scratchReservedForDynamicVGPRs: 512
-  %v = alloca <32 x i32>, align 128, addrspace(5)
-  store <32 x i32> %x, ptr addrspace(5) %v
+  %v = alloca <33 x i32>, align 128, addrspace(5)
+  store <33 x i32> %x, ptr addrspace(5) %v
   call amdgpu_gfx void @callee(i32 71)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
index ad42748ab3d60..c1123d7b515be 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
@@ -1,9 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --check-prefix=BASE --check-prefix=MAX16 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE %s --check-prefix=DEFAULT
 
 define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX16-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; MAX16-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; MAX16-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; MAX16-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX16-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
+; MAX24-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; MAX24-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
 ; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
 ; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -19,12 +51,50 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
 ; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
 ; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
-; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
 ; DEFAULT-NEXT:    ret void
 ;
-; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
-; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; MAX16-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; MAX16-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; MAX16-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; MAX16-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX16-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
 ; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
 ; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
@@ -36,18 +106,18 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; MAX24-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
 ; MAX24-NEXT:    ret void
 ;
-; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements(
-; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
-; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; MAX32-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
-; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
-; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
-; MAX32-NEXT:    ret void
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
 ;
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -67,18 +137,24 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
   ret void
 }
 
-define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
-; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
-; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
+; BASE-LABEL: define amdgpu_kernel void @i32_32_elements(
+; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; BASE-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; BASE-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
 ; BASE-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
 ; BASE-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
 ; BASE-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
 ; BASE-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; BASE-NEXT:    [[ALLOCA:%.*]] = freeze <24 x i32> poison
-; BASE-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
-; BASE-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; BASE-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; BASE-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; BASE-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; BASE-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; BASE-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; BASE-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; BASE-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; BASE-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; BASE-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; BASE-NEXT:    ret void
 ;
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -87,40 +163,40 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
   %c2 = icmp uge i32 %y, 3
   %sel1 = select i1 %c1, i32 1, i32 2
   %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %alloca = alloca [24 x i32], align 16, addrspace(5)
-  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
-  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
-  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  %alloca = alloca [32 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
+  %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
   store i32 42, ptr addrspace(5) %gep.0
   store i32 43, ptr addrspace(5) %gep.1
-  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
   %load = load i32, ptr addrspace(5) %gep
   store i32 %load, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
-; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements(
-; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
-; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
-; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
-; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
-; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
-; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
-; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
-; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
-; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
-; DEFAULT-NEXT:    ret void
+define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; MAX16-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; MAX16-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; MAX16-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; MAX16-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT:    store i32 [[TMP1]]...
[truncated]

@perlfu perlfu requested a review from rampitec August 23, 2025 05:07
Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generally, I think using volatile is the best way to defeat the optimization in tests that are intended to use the stack, but it's not that important.

define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) {
%alloca = alloca [2 x <16 x i32>], addrspace(5)
%tmp0 = getelementptr [2 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a
%alloca = alloca [3 x <16 x i32>], addrspace(5)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These cases probably should have been defeated by using volatile or by adding the flag, but I guess it doesn't really matter.

%v = alloca <32 x i32>, align 128, addrspace(5)
store <32 x i32> %x, ptr addrspace(5) %v
%v = alloca <33 x i32>, align 128, addrspace(5)
store <33 x i32> %x, ptr addrspace(5) %v
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

volatile?

@perlfu perlfu merged commit 1f6648c into llvm:main Aug 26, 2025
10 of 12 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Aug 26, 2025

LLVM Buildbot has detected a new failure on builder clang-aarch64-sve-vls running on linaro-g3-04 while building llvm at step 7 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/143/builds/10298

Here is the relevant piece of the build log for reference:
Step 7 (ninja check 1) failure: stage 1 checked (failure)
...
[436/792] Building CXX object unittests/Object/CMakeFiles/ObjectTests.dir/ObjectFileTest.cpp.o
[437/792] Building CXX object tools/clang/unittests/CMakeFiles/AllClangUnitTests.dir/Serialization/VarDeclConstantInitTest.cpp.o
[438/792] Building CXX object unittests/Remarks/CMakeFiles/RemarksTests.dir/RemarksLinkingTest.cpp.o
[439/792] Building CXX object unittests/Object/CMakeFiles/ObjectTests.dir/SymbolicFileTest.cpp.o
[440/792] Building CXX object unittests/ObjectYAML/CMakeFiles/ObjectYAMLTests.dir/YAML2ObjTest.cpp.o
[441/792] Building CXX object unittests/ProfileData/CMakeFiles/ProfileDataTests.dir/SymbolRemappingReaderTest.cpp.o
[442/792] Building CXX object unittests/Object/CMakeFiles/ObjectTests.dir/OffloadingTest.cpp.o
[443/792] Building CXX object unittests/Remarks/CMakeFiles/RemarksTests.dir/YAMLRemarksSerializerTest.cpp.o
[443/792] cd /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/runtimes/runtimes-bins && /usr/local/bin/cmake --build /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/runtimes/runtimes-bins/ --target runtimes-test-depends --config Release
[1/1] Linking CXX executable flang-rt/unittests/Runtime/RuntimeTests
FAILED: flang-rt/unittests/Runtime/RuntimeTests 
: && /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/./bin/clang++ --target=aarch64-unknown-linux-gnu -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -O3 -DNDEBUG -fuse-ld=lld -Wl,--color-diagnostics    -Wl,--gc-sections flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/AccessTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Assign.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/BufferTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CharacterTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CommandTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Complex.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CrashHandlerFixture.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Derived.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Descriptor.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ExternalIOTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Format.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/InputExtensions.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Inquiry.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ListInputTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/LogicalFormatTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Matmul.cpp.o 
flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/MatmulTranspose.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/MiscIntrinsic.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Namelist.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Numeric.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/NumericalFormatTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Pointer.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Ragged.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Random.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Reduction.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/RuntimeCrashTest.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Stop.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Support.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Time.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/TemporaryStack.cpp.o flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Transformational.cpp.o -o flang-rt/unittests/Runtime/RuntimeTests  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libLLVMSupport.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libllvm_gtest_main.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libllvm_gtest.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/clang/22/lib/aarch64-unknown-linux-gnu/libflang_rt.runtime.a  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libLLVMSupport.a  -lrt  -ldl  -lm  /usr/lib/aarch64-linux-gnu/libz.so  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls/stage1/lib/libLLVMDemangle.a  -lpthread && :
ld.lld: error: undefined symbol: testing::internal::GetBoolAssertionFailureMessage[abi:cxx11](testing::AssertionResult const&, char const*, char const*, char const*)
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(AllocatableTest_MoveAlloc_Test::TestBody())
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(AllocatableTest_MoveAlloc_Test::TestBody())
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(AllocatableTest_MoveAlloc_Test::TestBody())
>>> referenced 338 more times

ld.lld: error: undefined symbol: llvm::Twine::str[abi:cxx11]() const
>>> referenced by AccessTest.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/AccessTest.cpp.o:(createTemporaryFile[abi:cxx11](char const*, (anonymous namespace)::AccessType const&))

ld.lld: error: undefined symbol: testing::internal::EqFailure(char const*, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, bool)
>>> referenced by BufferTest.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/BufferTest.cpp.o:(BufferTests_TestFrameBufferReadAndWrite_Test::TestBody())
>>> referenced by Assign.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Assign.cpp.o:(testing::AssertionResult testing::internal::CmpHelperEQ<Fortran::runtime::TypeCode, Fortran::runtime::TypeCode>(char const*, char const*, Fortran::runtime::TypeCode const&, Fortran::runtime::TypeCode const&))
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(testing::AssertionResult testing::internal::CmpHelperEQ<std::basic_string_view<char, std::char_traits<char>>, char [50]>(char const*, char const*, std::basic_string_view<char, std::char_traits<char>> const&, char const (&) [50]))
>>> referenced 46 more times

ld.lld: error: undefined symbol: testing::internal::PrintStringTo(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, std::ostream*)
>>> referenced by Allocatable.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/Allocatable.cpp.o:(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> testing::PrintToString<std::basic_string_view<char, std::char_traits<char>>>(std::basic_string_view<char, std::char_traits<char>> const&))
>>> referenced by ArrayConstructor.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o:(testing::internal::MatchesRegexMatcher::DescribeTo(std::ostream*) const)
>>> referenced by ArrayConstructor.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o:(testing::internal::MatchesRegexMatcher::DescribeNegationTo(std::ostream*) const)
>>> referenced 4 more times

ld.lld: error: undefined symbol: testing::internal::DeathTest::Create(char const*, testing::Matcher<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&>, char const*, int, testing::internal::DeathTest**)
>>> referenced by ArrayConstructor.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/ArrayConstructor.cpp.o:(ArrayConstructor_CharacterRuntimeCheck_Test::TestBody())
>>> referenced by CommandTest.cpp
>>>               flang-rt/unittests/Runtime/CMakeFiles/RuntimeTests.dir/CommandTest.cpp.o:(ZeroArguments_ECLInvalidCommandTerminatedSync_Test::TestBody())
>>> referenced by ListInputTest.cpp

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Oct 9, 2025
…55076)

Increase promote-alloca-to-vector-max-regs to 32 from 16.
This restores default promotion of 16 x double which was disabled by llvm#127973.

Fixes SWDEV-525817.
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Oct 9, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants