diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index f226c7f381aa2..8617d868ef8ab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs( "amdgpu-promote-alloca-to-vector-max-regs", cl::desc( "Maximum vector size (in 32b registers) to use when promoting alloca"), - cl::init(16)); + cl::init(32)); // Use up to 1/4 of available register budget for vectorization. // FIXME: Increase the limit for whole function budgets? Perhaps x2? @@ -287,8 +287,12 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { // Load per function limits, overriding with global options where appropriate. + // R600 register tuples/aliasing are fragile with large vector promotions so + // apply architecture specific limit here. + const int R600MaxVectorRegs = 16; MaxVectorRegs = F.getFnAttributeAsParsedInteger( - "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs); + "amdgpu-promote-alloca-to-vector-max-regs", + IsAMDGCN ? 
PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs); if (PromoteAllocaToVectorMaxRegs.getNumOccurrences()) MaxVectorRegs = PromoteAllocaToVectorMaxRegs; VGPRBudgetRatio = F.getFnAttributeAsParsedInteger( diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll index 3f499535400ef..ac30297770807 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll @@ -154,26 +154,31 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 { ; CHECK-LABEL: realign_stack: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) -; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi +; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_cmp_lg_u32 0, s33 -; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi ; CHECK-NEXT: s_cmovk_i32 s33, 0x200 -; CHECK-NEXT: s_movk_i32 s32, 0x100 +; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT: scratch_store_b32 off, v32, s33 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112 ; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96 -; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80 +; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112 ; CHECK-NEXT: scratch_store_b128 off, v[16:19], s33 offset:64 -; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48 +; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80 ; CHECK-NEXT: scratch_store_b128 off, v[8:11], s33 offset:32 +; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48 ; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16 ; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x47 +; CHECK-NEXT: s_movk_i32 s32, 0x100 ; CHECK-NEXT: s_cmovk_i32 s32, 0x300 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1] ; 
CHECK-NEXT: s_alloc_vgpr 0 ; CHECK-NEXT: s_endpgm %v = alloca <32 x i32>, align 128, addrspace(5) + ; use volatile store to avoid promotion of alloca to registers + store volatile i32 0, ptr addrspace(5) %v store <32 x i32> %x, ptr addrspace(5) %v call amdgpu_gfx void @callee(i32 71) ret void diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll index cd428be729ae2..966e5c8f460dc 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll @@ -35,7 +35,8 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 { ; CHECK-LABEL: {{^}}name: realign_stack ; CHECK: scratchReservedForDynamicVGPRs: 512 %v = alloca <32 x i32>, align 128, addrspace(5) - store <32 x i32> %x, ptr addrspace(5) %v + ; use volatile store to avoid promotion of alloca to registers + store volatile <32 x i32> %x, ptr addrspace(5) %v call amdgpu_gfx void @callee(i32 71) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll index ad42748ab3d60..c1123d7b515be 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll @@ -1,9 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --check-prefix=BASE --check-prefix=MAX16 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24 -; RUN: opt -S 
-mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE %s --check-prefix=DEFAULT define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { +; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements( +; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX16-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX16-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; MAX16-NEXT: ret void +; +; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; 
MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX24-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison +; MAX24-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]] +; MAX24-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX24-NEXT: ret void +; ; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements( ; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -19,12 +51,50 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { ; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 ; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 ; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] -; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 -; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 ; DEFAULT-NEXT: ret void ; -; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements( -; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [24 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) + %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 
0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { +; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX16-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX16-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; MAX16-NEXT: ret void +; +; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { ; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() ; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 @@ -36,18 +106,18 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { ; MAX24-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 ; MAX24-NEXT: ret void ; -; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements( -; MAX32-SAME: ptr [[OUT:%.*]]) 
#[[ATTR0:[0-9]+]] { -; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison -; MAX32-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]] -; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 -; MAX32-NEXT: ret void +; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison +; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]] +; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void ; %x = tail call i32 @llvm.amdgcn.workitem.id.x() %y = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -67,18 +137,24 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { ret void } -define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { -; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( -; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +define amdgpu_kernel void @i32_32_elements(ptr %out) #0 { +; BASE-LABEL: define amdgpu_kernel void @i32_32_elements( +; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; BASE-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; BASE-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() ; 
BASE-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 ; BASE-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 ; BASE-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 ; BASE-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; BASE-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison -; BASE-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]] -; BASE-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; BASE-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) +; BASE-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) +; BASE-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; BASE-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 +; BASE-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; BASE-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; BASE-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; BASE-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; BASE-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 ; BASE-NEXT: ret void ; %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -87,40 +163,40 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %alloca = alloca [24 x i32], align 16, addrspace(5) - call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) - %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 - %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + %alloca = alloca [32 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false) + %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [32 x i32], ptr 
addrspace(5) %alloca, i32 0, i32 30 store i32 42, ptr addrspace(5) %gep.0 store i32 43, ptr addrspace(5) %gep.1 - %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 %load = load i32, ptr addrspace(5) %gep store i32 %load, ptr %out ret void } -define amdgpu_kernel void @i32_32_elements(ptr %out) #0 { -; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements( -; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { -; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; DEFAULT-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) -; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) -; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 -; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 -; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 -; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 -; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] -; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 -; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 -; DEFAULT-NEXT: ret void +define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 { +; MAX16-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( +; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX16-NEXT: [[Y:%.*]] 
= tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) +; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) +; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 +; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX16-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX16-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX16-NEXT: ret void ; -; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements( -; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { +; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { ; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() ; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 @@ -138,38 +214,6 @@ define amdgpu_kernel void @i32_32_elements(ptr %out) #0 { ; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 ; MAX24-NEXT: ret void ; -; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements( -; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { -; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX32-NEXT: 
[[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <32 x i32> poison -; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]] -; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 -; MAX32-NEXT: ret void -; - %x = tail call i32 @llvm.amdgcn.workitem.id.x() - %y = tail call i32 @llvm.amdgcn.workitem.id.y() - %c1 = icmp uge i32 %x, 3 - %c2 = icmp uge i32 %y, 3 - %sel1 = select i1 %c1, i32 1, i32 2 - %sel2 = select i1 %c2, i32 0, i32 %sel1 - %alloca = alloca [32 x i32], align 16, addrspace(5) - call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false) - %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 - %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30 - store i32 42, ptr addrspace(5) %gep.0 - store i32 43, ptr addrspace(5) %gep.1 - %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 - %load = load i32, ptr addrspace(5) %gep - store i32 %load, ptr %out - ret void -} - -define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 { ; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( ; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -182,38 +226,6 @@ define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 { ; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]] ; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 ; DEFAULT-NEXT: ret void -; -; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( -; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { -; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX24-NEXT: 
[[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX24-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) -; MAX24-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) -; MAX24-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 -; MAX24-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 -; MAX24-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 -; MAX24-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 -; MAX24-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] -; MAX24-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 -; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 -; MAX24-NEXT: ret void -; -; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( -; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { -; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <32 x i32> poison -; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]] -; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 -; MAX32-NEXT: ret void ; %x = tail call i32 @llvm.amdgcn.workitem.id.x() %y = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -237,6 +249,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) -attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" } +attributes 
#0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="16" } attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="24" } -attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="32" } +attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index aabd5df956837..ec04c6aa7f10d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck --enable-var-scope %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --enable-var-scope %s declare void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0 declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll index 27ecc837ea732..ba4fedf5bb009 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll @@ -9,7 +9,7 @@ define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) { ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog entry: - %p = alloca [32 x i32], align 4, addrspace(5) + %p = alloca [64 x i32], align 4, addrspace(5) %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom %ret = load float, ptr 
addrspace(5) %arrayidx, align 4