llvm/lib/Transforms/Scalar/SROA.cpp: 123 changes (96 additions, 27 deletions)
@@ -490,6 +490,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
for_each(DVRAssignMarkerRange, MigrateDbgAssign);
}

static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
uint64_t Size);
static Type *getTypePartition(const AllocaInst &AI, const Partition &P);

namespace {

/// A custom IRBuilder inserter which prefixes all names, but only in
@@ -1011,6 +1015,35 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}

/// Returns a fixed vector type equivalent to the memory set by II, or nullptr
/// if no viable vector type exists.
static FixedVectorType *getVectorTypeFor(const DataLayout &DL, Type *PartTy,
const MemSetInst &II) {
auto *PartVecTy = dyn_cast_or_null<FixedVectorType>(PartTy);
if (!PartVecTy)
return nullptr;

const uint64_t PartVecSize = DL.getTypeStoreSize(PartVecTy).getFixedValue();

const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
if (!Length)
return nullptr;

const APInt &Val = Length->getValue();
if (Val.ugt(PartVecSize))
return nullptr;

// Element type will always be i8. TODO: Support
// llvm.experimental.memset.pattern?
return FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
}

static FixedVectorType *getVectorTypeFor(const AllocaInst &AI,
const Partition &P,
const MemSetInst &II) {
return getVectorTypeFor(AI.getDataLayout(), getTypePartition(AI, P), II);
}

/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
@@ -1022,6 +1055,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
using Base = PtrUseVisitor<SliceBuilder>;

const uint64_t AllocSize;
const AllocaInst &AI;
AllocaSlices &AS;

SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
@@ -1034,7 +1068,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
: PtrUseVisitor<SliceBuilder>(DL),
AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
AS(AS) {}
AI(AI), AS(AS) {}

private:
void markAsDead(Instruction &I) {
@@ -1182,10 +1216,17 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
if (!IsOffsetKnown)
return PI.setAborted(&II);

insertUse(II, Offset,
Length ? Length->getLimitedValue()
: AllocSize - Offset.getLimitedValue(),
(bool)Length);
uint64_t Size = Length ? Length->getLimitedValue()
: AllocSize - Offset.getLimitedValue();
bool Splittable = (bool)Length;
if (Splittable) {
// Encourage the use of vector types by making this non-splittable if the
// memset corresponds to a viable vector type.
Type *PartTy = getTypePartition(DL, AI.getAllocatedType(),
Offset.getLimitedValue(), Size);
Splittable = !getVectorTypeFor(DL, PartTy, II);
}
insertUse(II, Offset, Size, Splittable);
}

void visitMemTransferInst(MemTransferInst &II) {
@@ -2118,11 +2159,12 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
VectorType *Ty,
static bool isVectorPromotionViableForSlice(const AllocaInst &AI, Partition &P,
const Slice &S, VectorType *Ty,
uint64_t ElementSize,
const DataLayout &DL,
unsigned VScale) {
const DataLayout &DL = AI.getDataLayout();

// First validate the slice offsets.
uint64_t BeginOffset =
std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
@@ -2150,8 +2192,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile())
return false;
if (!S.isSplittable())
return false; // Skip any unsplittable intrinsics.

if (!S.isSplittable()) {
// Skip any non-memset unsplittable intrinsics.
auto *II = dyn_cast<MemSetInst>(U->getUser());
if (!II)
return false;

// For memset, allow it if we have a viable vector type.
Type *VTy = getVectorTypeFor(AI, P, *II);
if (!VTy)
return false;
if (!canConvertValue(DL, SliceTy, VTy))
return false;
}
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
@@ -2193,8 +2247,9 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
/// This implements the necessary checking for \c checkVectorTypesForPromotion
/// (and thus isVectorPromotionViable) over all slices of the alloca for the
/// given VectorType.
static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
const DataLayout &DL, unsigned VScale) {
static bool checkVectorTypeForPromotion(const AllocaInst &AI, Partition &P,
VectorType *VTy, unsigned VScale) {
const DataLayout &DL = AI.getDataLayout();
uint64_t ElementSize =
DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();

@@ -2207,11 +2262,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
ElementSize /= 8;

for (const Slice &S : P)
if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
if (!isVectorPromotionViableForSlice(AI, P, S, VTy, ElementSize, VScale))
return false;

for (const Slice *S : P.splitSliceTails())
if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
if (!isVectorPromotionViableForSlice(AI, P, *S, VTy, ElementSize, VScale))
return false;

return true;
@@ -2222,11 +2277,12 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
/// This implements the necessary checking for \c isVectorPromotionViable over
/// all slices of the alloca for the given VectorType.
static VectorType *
checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
checkVectorTypesForPromotion(const AllocaInst &AI, Partition &P,
SmallVectorImpl<VectorType *> &CandidateTys,
bool HaveCommonEltTy, Type *CommonEltTy,
bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
VectorType *CommonVecPtrTy, unsigned VScale) {
const DataLayout &DL = AI.getDataLayout();
// If we didn't find a vector type, nothing to do here.
if (CandidateTys.empty())
return nullptr;
@@ -2302,18 +2358,19 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
});

for (VectorType *VTy : CandidateTys)
if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
if (checkVectorTypeForPromotion(AI, P, VTy, VScale))
return VTy;

return nullptr;
}

static VectorType *createAndCheckVectorTypesForPromotion(
SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
function_ref<void(Type *)> CheckCandidateType, Partition &P,
const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
function_ref<void(Type *)> CheckCandidateType, const AllocaInst &AI,
Partition &P, SmallVectorImpl<VectorType *> &CandidateTys,
bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
const DataLayout &DL = AI.getDataLayout();
[[maybe_unused]] VectorType *OriginalElt =
CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
// Consider additional vector types where the element type size is a
@@ -2339,7 +2396,7 @@ static VectorType *createAndCheckVectorTypesForPromotion(
}

return checkVectorTypesForPromotion(
P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
AI, P, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
}

@@ -2352,10 +2409,11 @@ static VectorType *createAndCheckVectorTypesForPromotion(
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
static VectorType *isVectorPromotionViable(const AllocaInst &AI, Partition &P,
unsigned VScale) {
// Collect the candidate types for vector-based promotion. Also track whether
// we have different element types.
const DataLayout &DL = AI.getDataLayout();
SmallVector<VectorType *, 4> CandidateTys;
SetVector<Type *> LoadStoreTys;
SetVector<Type *> DeferredTys;
@@ -2395,12 +2453,16 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,

// Put load and store types into a set for de-duplication.
for (const Slice &S : P) {
Type *Ty;
Type *Ty = nullptr;
if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
Ty = LI->getType();
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
Ty = SI->getValueOperand()->getType();
else
else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser())) {
Ty = getVectorTypeFor(AI, P, *II);
if (!Ty)
continue;
} else
continue;

auto CandTy = Ty->getScalarType();
@@ -2418,14 +2480,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,

SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
if (auto *VTy = createAndCheckVectorTypesForPromotion(
LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
LoadStoreTys, CandidateTysCopy, CheckCandidateType, AI, P,
CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
return VTy;

CandidateTys.clear();
return createAndCheckVectorTypesForPromotion(
DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
DeferredTys, CandidateTysCopy, CheckCandidateType, AI, P, CandidateTys,
HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
CommonVecPtrTy, VScale);
}
@@ -4410,6 +4472,13 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return SubTy;
}

static Type *getTypePartition(const AllocaInst &AI, const Partition &P) {
if (P.empty())
return nullptr;
return getTypePartition(AI.getDataLayout(), AI.getAllocatedType(),
P.beginOffset(), P.size());
}

/// Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
@@ -4957,12 +5026,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,

// If the common use types are not viable for promotion then attempt to find
// another type that is viable.
if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
if (SliceVecTy && !checkVectorTypeForPromotion(AI, P, SliceVecTy, VScale))
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size())) {
VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
if (TypePartitionVecTy &&
checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
checkVectorTypeForPromotion(AI, P, TypePartitionVecTy, VScale))
SliceTy = TypePartitionTy;
}

@@ -4973,7 +5042,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);

VectorType *VecTy =
IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
IsIntegerPromotable ? nullptr : isVectorPromotionViable(AI, P, VScale);
if (VecTy)
SliceTy = VecTy;

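For readers skimming the diff, the core of the change is the mapping performed by the new file-local getVectorTypeFor helper: a constant-length memset whose bytes fit inside a vector-typed partition is modelled as an <N x i8> value (with N the memset length), which isVectorPromotionViable can then accept once canConvertValue agrees with the partition's own vector type. The C++ snippet below is a minimal standalone sketch of that mapping; the function name memsetVectorTypeSketch and the worked case in its comments are illustrative only and are not part of the patch, which additionally threads the alloca and Partition through getTypePartition.

// Standalone sketch (illustrative, not the in-tree helper): derive the vector
// type that a constant-length memset over a vector-typed partition could be
// rewritten to, or nullptr if the memset is not viable as a vector value.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

static FixedVectorType *memsetVectorTypeSketch(const DataLayout &DL,
                                               Type *PartTy, Value *Length,
                                               Type *MemsetValTy) {
  // The partition must already look like a fixed-width vector.
  auto *PartVecTy = dyn_cast_or_null<FixedVectorType>(PartTy);
  if (!PartVecTy)
    return nullptr;

  // Only a constant-length memset can be modelled as a single vector value.
  auto *Len = dyn_cast_or_null<ConstantInt>(Length);
  if (!Len)
    return nullptr;

  // The memset must not write past the end of the partition.
  if (Len->getValue().ugt(DL.getTypeStoreSize(PartVecTy).getFixedValue()))
    return nullptr;

  // memset writes bytes, so the element type is the memset's value type
  // (currently always i8): e.g. a 16-byte memset over an <8 x half>
  // partition yields <16 x i8>, which SROA can later convert to <8 x half>.
  return FixedVectorType::get(MemsetValTy, Len->getZExtValue());
}

The design choice worth noting is that the element type comes from the memset's value operand (always i8 today, per the TODO about llvm.experimental.memset.pattern), so a qualifying partition is first modelled as bytes and only converted to its own vector type when canConvertValue allows it.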
llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll: 6 changes (0 additions, 6 deletions)
@@ -285,8 +285,6 @@ bb:
define amdgpu_kernel void @test_array_vector() #0 {
; CHECK-LABEL: @test_array_vector(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT: br label [[BB:%.*]]
@@ -318,8 +316,6 @@ bb:
define amdgpu_kernel void @test_array_vector2() #0 {
; CHECK-LABEL: @test_array_vector2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT: br label [[BB:%.*]]
@@ -355,12 +351,10 @@ define amdgpu_kernel void @test_array_vector_no_vector_common_type() #0 {
; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_7:%.*]] = alloca float, align 8
; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_10:%.*]] = alloca float, align 4
; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_13:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 [[B_BLOCKWISE_COPY_SROA_7]], i8 0, i32 4, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_10]], i8 0, i32 4, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_13]], i8 0, i32 16, i1 false)
; CHECK-NEXT: [[DATA1:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT: [[DATA2:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT: [[DATA3:%.*]] = load float, ptr undef, align 4