Merged
96 changes: 55 additions & 41 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,6 +818,28 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
  return I;
}

/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
/// type is non-homogeneous.
static Type *getHomogeneousType(Type *Ty) {
  if (auto *VectorTy = dyn_cast<FixedVectorType>(Ty))
    return VectorTy->getElementType();
  if (auto *ArrayTy = dyn_cast<ArrayType>(Ty))
    return getHomogeneousType(ArrayTy->getElementType());
  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
    if (StructTy->getNumElements() == 0)
      return nullptr;

    auto *Iter = StructTy->element_begin();
    Type *HTy = getHomogeneousType(*Iter);
    for (; Iter != StructTy->element_end(); ++Iter)
      if (getHomogeneousType(*Iter) != HTy)
        return nullptr;

    return HTy;
  }
  return Ty;
}
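// Illustrative results (editor's sketch, not part of the patch), written in
// LLVM type syntax:
//   getHomogeneousType([4 x i8])             --> i8
//   getHomogeneousType({<2 x i8>, [2 x i8]}) --> i8 (nesting may be mixed)
//   getHomogeneousType({i8, i16})            --> nullptr (heterogeneous)
//   getHomogeneousType({i8, {}})             --> nullptr (empty struct member)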

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -828,42 +850,43 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
  }

  Type *AllocaTy = Alloca.getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));

    // Check for array of vectors
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }
  Type *ElemTy = getHomogeneousType(AllocaTy);

    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
        // Expand vector if required to match padding of inner type,
        // i.e. odd size subvectors.
        // Storage size of new vector must match that of alloca for correct
        // behaviour of byte offsets and GEP computation.
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElemTy, NumElems);
      }
    }
  if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
  if (ElementSizeInBits == 0) {
    LLVM_DEBUG(dbgs() << "  Cannot create vector of zero-sized elements\n");
    return false;
  }
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return false;
  }
  unsigned ElementSize = ElementSizeInBits / 8;
  if (ElementSize == 0)
    return false;
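  // Editor's note: e.g. an i1 element has getTypeSizeInBits == 1 but
  // getTypeAllocSizeInBits == 8, so such a type is rejected above; element
  // bits must exactly fill their allocated storage for byte offsets to stay
  // meaningful.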

  // Calculate the size of the corresponding vector, accounting for padding of
  // inner types, e.g., odd-sized subvectors. Storage size of new vector must
  // match that of alloca for correct behaviour of byte offsets and GEP
  // computation.
  unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
  unsigned NumElems = AllocaSize / ElementSize;
  if (NumElems == 0) {
    LLVM_DEBUG(dbgs() << "  Cannot vectorize an empty aggregate type\n");
    return false;
  }
  if (NumElems * ElementSize != AllocaSize) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type into vector of the same "
                         "size\n");
    return false;
  }
  auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
  assert(VectorTy && "Failed to create vector type.");
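  // Worked example (editor's illustration, cf. test_a2v3i8 below): for an
  // alloca of [2 x <3 x i8>], ElemTy is i8 and each <3 x i8> is padded to 4
  // bytes, so AllocaSize is 8 and NumElems is 8; the alloca is promoted to
  // <8 x i8>, matching the padded in-memory layout.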

  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
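  // Editor's note: with the -amdgpu-promote-alloca-to-vector-max-regs=32
  // setting used by the test below and i8 elements, this budget works out to
  // (32 * 32) / 8 == 128 elements per promoted vector.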
@@ -895,15 +918,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {

LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");

  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return false;
  }
  unsigned ElementSize = ElementSizeInBits / 8;
  assert(ElementSize > 0);
  for (auto *U : Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());

@@ -943,7 +957,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
      Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");
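      // For instance (editor's sketch, mirroring the IR tests below): with
      // ElemTy == i8, the GEP
      //   %ptr = getelementptr i8, ptr addrspace(5) %stack, i64 %idx
      // yields vector index %idx, so a load through %ptr can be rewritten as
      // an extractelement of the promoted vector at index %idx.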

263 changes: 263 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
@@ -0,0 +1,263 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 %s | FileCheck %s

declare void @clobber_i8(i8)

define void @test_v4i8(i64 %idx) {
; CHECK-LABEL: define void @test_v4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca <4 x i8>, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_a4i8(i64 %idx) {
; CHECK-LABEL: define void @test_a4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca [4 x i8], align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_a2v4i8(i64 %idx) {
; CHECK-LABEL: define void @test_a2v4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
Contributor:

I thought SROA already tried to flatten out aggregates into simple arrays. Why do we need to do this? We don't need to optimally handle all IR, just post-optimized IR.

Contributor (author):

This behaviour is actually already implemented in the AMDGPUPromoteAllocaToVector pass. I put the test in to ensure this behaviour didn't change with this PR.

If I understand correctly, the problem with SROA is that it tends to prefer promoting values to scalar registers rather than vectors, which is why the AMDGPUPromoteAllocaToVector pass is scheduled before SROA in the pipeline.

Contributor:

If SROA can break the value up, that's better; we should be running this after SROA.

%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_a2v3i8(i64 %idx) {
; CHECK-LABEL: define void @test_a2v3i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_a2a4i8(i64 %idx) {
; CHECK-LABEL: define void @test_a2a4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_a2a3i8(i64 %idx) {
; CHECK-LABEL: define void @test_a2a3i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s1v4i8(i64 %idx) {
; CHECK-LABEL: define void @test_s1v4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {<4 x i8>}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s1a4i8(i64 %idx) {
; CHECK-LABEL: define void @test_s1a4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {[4 x i8]}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s4i8(i64 %idx) {
; CHECK-LABEL: define void @test_s4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s2v4i8(i64 %idx) {
; CHECK-LABEL: define void @test_s2v4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s2v2i8v4i8(i64 %idx) {
; CHECK-LABEL: define void @test_s2v2i8v4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s2v2i8v3i8(i64 %idx) {
; CHECK-LABEL: define void @test_s2v2i8v3i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s2s2i8s4i8(i64 %idx) {
; CHECK-LABEL: define void @test_s2s2i8s4i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s2s2i8s3i8(i64 %idx) {
; CHECK-LABEL: define void @test_s2s2i8s3i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <5 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

define void @test_s3i8s1i8v2i8(i64 %idx) {
; CHECK-LABEL: define void @test_s3i8s1i8v2i8(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

; heterogeneous element types are not supported
define void @test_heterogeneous(i64 %idx) {
; CHECK-LABEL: define void @test_heterogeneous(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
; CHECK-NEXT: ret void
;
%stack = alloca {i8, i8, i16}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}

; empty structs are not supported
define void @test_empty(i64 %idx) {
; CHECK-LABEL: define void @test_empty(
; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
; CHECK-NEXT: ret void
;
%stack = alloca {i8, {}}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)
ret void
}