Skip to content

Commit a7608eb

Browse files
committed
fixup! [SROA] Vector promote some memsets
1 parent 128c1ee commit a7608eb

File tree

9 files changed

+292
-251
lines changed

9 files changed

+292
-251
lines changed

clang/test/CodeGenOpenCL/amdgpu-nullptr.cl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -503,19 +503,21 @@ void cast_bool_generic(generic char* p) {
503503
*p = 0;
504504
}
505505

506-
// Test initialization of a struct with a private member.
506+
// Test initialize a struct using memset.
507+
// For large structures which is mostly zero, clang generats llvm.memset for
508+
// the zero part and store for non-zero members.
507509
typedef struct {
508510
long a, b, c, d;
509511
private char *p;
510512
} StructTy3;
511513

512-
// CHECK-LABEL: test_struct_private_member
513-
// CHECK: store <32 x i8> zeroinitializer, ptr addrspace(5) {{.*}}, align 8
514+
// CHECK-LABEL: test_memset_private
515+
// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
514516
// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
515517
// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
516518
// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
517519
// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
518-
void test_struct_private_member(private StructTy3 *ptr) {
520+
void test_memset_private(private StructTy3 *ptr) {
519521
StructTy3 S3 = {0, 0, 0, 0, 0};
520522
*ptr = S3;
521523
}

llvm/lib/Transforms/Scalar/SROA.cpp

Lines changed: 78 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
490490
for_each(DVRAssignMarkerRange, MigrateDbgAssign);
491491
}
492492

493+
static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
494+
uint64_t Size);
495+
static Type *getTypePartition(const AllocaInst &AI, const Partition &P);
496+
493497
namespace {
494498

495499
/// A custom IRBuilder inserter which prefixes all names, but only in
@@ -1011,37 +1015,33 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
10111015
return foldSelectInst(cast<SelectInst>(I));
10121016
}
10131017

1014-
static constexpr size_t getMaxNumFixedVectorElements() {
1015-
// FIXME: hack. Do we have a named constant for this?
1016-
// SDAG SDNode can't have more than 65535 operands.
1017-
return std::numeric_limits<unsigned short>::max();
1018-
}
1019-
10201018
/// Returns a fixed vector type equivalent to the memory set by II or nullptr if
1021-
/// unable to do so.
1022-
static FixedVectorType *getVectorTypeFor(const MemSetInst &II,
1023-
const DataLayout &DL) {
1019+
/// not viable.
1020+
static FixedVectorType *getVectorTypeFor(const DataLayout &DL, Type *PartTy,
1021+
const MemSetInst &II) {
1022+
auto *PartVecTy = dyn_cast_or_null<FixedVectorType>(PartTy);
1023+
if (!PartVecTy)
1024+
return nullptr;
1025+
1026+
const uint64_t PartVecSize = DL.getTypeStoreSize(PartVecTy).getFixedValue();
1027+
10241028
const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
10251029
if (!Length)
10261030
return nullptr;
10271031

10281032
const APInt &Val = Length->getValue();
1029-
if (Val.ugt(getMaxNumFixedVectorElements()))
1033+
if (Val.ugt(PartVecSize))
10301034
return nullptr;
10311035

10321036
// Element type will always be i8. TODO: Support
10331037
// llvm.experimental.memset.pattern?
1034-
uint64_t MemSetLen = Val.getZExtValue();
1035-
auto *VTy = FixedVectorType::get(II.getValue()->getType(), MemSetLen);
1036-
1037-
// FIXME: This is a workaround. Vector promotion sometimes inhibits our
1038-
// ability to merge constant stores. It seems to be related to the presence of
1039-
// alignment bytes. See
1040-
// test/Transforms/PhaseOrdering/X86/store-constant-merge.ll
1041-
if (MemSetLen != DL.getTypeAllocSize(VTy).getFixedValue())
1042-
return nullptr;
1038+
return FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
1039+
}
10431040

1044-
return VTy;
1041+
static FixedVectorType *getVectorTypeFor(const AllocaInst &AI,
1042+
const Partition &P,
1043+
const MemSetInst &II) {
1044+
return getVectorTypeFor(AI.getDataLayout(), getTypePartition(AI, P), II);
10451045
}
10461046

10471047
/// Builder for the alloca slices.
@@ -1055,6 +1055,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
10551055
using Base = PtrUseVisitor<SliceBuilder>;
10561056

10571057
const uint64_t AllocSize;
1058+
const AllocaInst &AI;
10581059
AllocaSlices &AS;
10591060

10601061
SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
@@ -1067,7 +1068,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
10671068
SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
10681069
: PtrUseVisitor<SliceBuilder>(DL),
10691070
AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
1070-
AS(AS) {}
1071+
AI(AI), AS(AS) {}
10711072

10721073
private:
10731074
void markAsDead(Instruction &I) {
@@ -1132,16 +1133,15 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
11321133
return Base::visitGetElementPtrInst(GEPI);
11331134
}
11341135

1135-
bool isSplittableMemOp(Type *Ty, bool IsVolatile) {
1136-
return Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1137-
}
1138-
11391136
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
11401137
uint64_t Size, bool IsVolatile) {
11411138
// We allow splitting of non-volatile loads and stores where the type is an
11421139
// integer type. These may be used to implement 'memcpy' or other "transfer
11431140
// of bits" patterns.
1144-
insertUse(I, Offset, Size, isSplittableMemOp(Ty, IsVolatile));
1141+
bool IsSplittable =
1142+
Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1143+
1144+
insertUse(I, Offset, Size, IsSplittable);
11451145
}
11461146

11471147
void visitLoadInst(LoadInst &LI) {
@@ -1216,17 +1216,17 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
12161216
if (!IsOffsetKnown)
12171217
return PI.setAborted(&II);
12181218

1219-
bool Splittable;
1220-
1221-
if (getVectorTypeFor(II, DL))
1222-
Splittable = isSplittableMemOp(AS.AI.getAllocatedType(), II.isVolatile());
1223-
else
1224-
Splittable = (bool)Length;
1225-
1226-
insertUse(II, Offset,
1227-
Length ? Length->getLimitedValue()
1228-
: AllocSize - Offset.getLimitedValue(),
1229-
Splittable);
1219+
uint64_t Size = Length ? Length->getLimitedValue()
1220+
: AllocSize - Offset.getLimitedValue();
1221+
bool Splittable = (bool)Length;
1222+
if (Splittable) {
1223+
// Encourage the use of vector types by making this non-splittable if the
1224+
// memset corresponds to viable vector type.
1225+
Type *PartTy = getTypePartition(DL, AI.getAllocatedType(),
1226+
Offset.getLimitedValue(), Size);
1227+
Splittable = !getVectorTypeFor(DL, PartTy, II);
1228+
}
1229+
insertUse(II, Offset, Size, Splittable);
12301230
}
12311231

12321232
void visitMemTransferInst(MemTransferInst &II) {
@@ -2159,11 +2159,12 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
21592159
///
21602160
/// This function is called to test each entry in a partition which is slated
21612161
/// for a single slice.
2162-
static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2163-
VectorType *Ty,
2162+
static bool isVectorPromotionViableForSlice(const AllocaInst &AI, Partition &P,
2163+
const Slice &S, VectorType *Ty,
21642164
uint64_t ElementSize,
2165-
const DataLayout &DL,
21662165
unsigned VScale) {
2166+
const DataLayout &DL = AI.getDataLayout();
2167+
21672168
// First validate the slice offsets.
21682169
uint64_t BeginOffset =
21692170
std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
@@ -2192,14 +2193,14 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
21922193
if (MI->isVolatile())
21932194
return false;
21942195

2195-
auto *II = dyn_cast<MemSetInst>(U->getUser());
2196-
if (!II && !S.isSplittable()) {
2196+
if (!S.isSplittable()) {
21972197
// Skip any non-memset unsplittable intrinsics.
2198-
return false;
2199-
}
2200-
if (II) {
2201-
// For memset, allow if we have a suitable vector type
2202-
Type *VTy = getVectorTypeFor(*II, DL);
2198+
auto *II = dyn_cast<MemSetInst>(U->getUser());
2199+
if (!II)
2200+
return false;
2201+
2202+
// For memset, allow if we have a viable vector type
2203+
Type *VTy = getVectorTypeFor(AI, P, *II);
22032204
if (!VTy)
22042205
return false;
22052206
if (!canConvertValue(DL, SliceTy, VTy))
@@ -2246,8 +2247,9 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
22462247
/// This implements the necessary checking for \c checkVectorTypesForPromotion
22472248
/// (and thus isVectorPromotionViable) over all slices of the alloca for the
22482249
/// given VectorType.
2249-
static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2250-
const DataLayout &DL, unsigned VScale) {
2250+
static bool checkVectorTypeForPromotion(const AllocaInst &AI, Partition &P,
2251+
VectorType *VTy, unsigned VScale) {
2252+
const DataLayout &DL = AI.getDataLayout();
22512253
uint64_t ElementSize =
22522254
DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
22532255

@@ -2260,11 +2262,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
22602262
ElementSize /= 8;
22612263

22622264
for (const Slice &S : P)
2263-
if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2265+
if (!isVectorPromotionViableForSlice(AI, P, S, VTy, ElementSize, VScale))
22642266
return false;
22652267

22662268
for (const Slice *S : P.splitSliceTails())
2267-
if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2269+
if (!isVectorPromotionViableForSlice(AI, P, *S, VTy, ElementSize, VScale))
22682270
return false;
22692271

22702272
return true;
@@ -2275,11 +2277,12 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
22752277
/// This implements the necessary checking for \c isVectorPromotionViable over
22762278
/// all slices of the alloca for the given VectorType.
22772279
static VectorType *
2278-
checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2280+
checkVectorTypesForPromotion(const AllocaInst &AI, Partition &P,
22792281
SmallVectorImpl<VectorType *> &CandidateTys,
22802282
bool HaveCommonEltTy, Type *CommonEltTy,
22812283
bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
22822284
VectorType *CommonVecPtrTy, unsigned VScale) {
2285+
const DataLayout &DL = AI.getDataLayout();
22832286
// If we didn't find a vector type, nothing to do here.
22842287
if (CandidateTys.empty())
22852288
return nullptr;
@@ -2347,24 +2350,27 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
23472350
CandidateTys.resize(1);
23482351
}
23492352

2353+
// FIXME: hack. Do we have a named constant for this?
2354+
// SDAG SDNode can't have more than 65535 operands.
23502355
llvm::erase_if(CandidateTys, [](VectorType *VTy) {
23512356
return cast<FixedVectorType>(VTy)->getNumElements() >
2352-
getMaxNumFixedVectorElements();
2357+
std::numeric_limits<unsigned short>::max();
23532358
});
23542359

23552360
for (VectorType *VTy : CandidateTys)
2356-
if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
2361+
if (checkVectorTypeForPromotion(AI, P, VTy, VScale))
23572362
return VTy;
23582363

23592364
return nullptr;
23602365
}
23612366

23622367
static VectorType *createAndCheckVectorTypesForPromotion(
23632368
SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2364-
function_ref<void(Type *)> CheckCandidateType, Partition &P,
2365-
const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2369+
function_ref<void(Type *)> CheckCandidateType, const AllocaInst &AI,
2370+
Partition &P, SmallVectorImpl<VectorType *> &CandidateTys,
23662371
bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
23672372
bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2373+
const DataLayout &DL = AI.getDataLayout();
23682374
[[maybe_unused]] VectorType *OriginalElt =
23692375
CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
23702376
// Consider additional vector types where the element type size is a
@@ -2390,7 +2396,7 @@ static VectorType *createAndCheckVectorTypesForPromotion(
23902396
}
23912397

23922398
return checkVectorTypesForPromotion(
2393-
P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2399+
AI, P, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
23942400
HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
23952401
}
23962402

@@ -2403,10 +2409,11 @@ static VectorType *createAndCheckVectorTypesForPromotion(
24032409
/// SSA value. We only can ensure this for a limited set of operations, and we
24042410
/// don't want to do the rewrites unless we are confident that the result will
24052411
/// be promotable, so we have an early test here.
2406-
static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2412+
static VectorType *isVectorPromotionViable(const AllocaInst &AI, Partition &P,
24072413
unsigned VScale) {
24082414
// Collect the candidate types for vector-based promotion. Also track whether
24092415
// we have different element types.
2416+
const DataLayout &DL = AI.getDataLayout();
24102417
SmallVector<VectorType *, 4> CandidateTys;
24112418
SetVector<Type *> LoadStoreTys;
24122419
SetVector<Type *> DeferredTys;
@@ -2452,7 +2459,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
24522459
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
24532460
Ty = SI->getValueOperand()->getType();
24542461
else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser())) {
2455-
Ty = getVectorTypeFor(*II, DL);
2462+
Ty = getVectorTypeFor(AI, P, *II);
24562463
if (!Ty)
24572464
continue;
24582465
} else
@@ -2473,14 +2480,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
24732480

24742481
SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
24752482
if (auto *VTy = createAndCheckVectorTypesForPromotion(
2476-
LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2483+
LoadStoreTys, CandidateTysCopy, CheckCandidateType, AI, P,
24772484
CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
24782485
HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
24792486
return VTy;
24802487

24812488
CandidateTys.clear();
24822489
return createAndCheckVectorTypesForPromotion(
2483-
DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2490+
DeferredTys, CandidateTysCopy, CheckCandidateType, AI, P, CandidateTys,
24842491
HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
24852492
CommonVecPtrTy, VScale);
24862493
}
@@ -4465,6 +4472,13 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
44654472
return SubTy;
44664473
}
44674474

4475+
static Type *getTypePartition(const AllocaInst &AI, const Partition &P) {
4476+
if (P.empty())
4477+
return nullptr;
4478+
return getTypePartition(AI.getDataLayout(), AI.getAllocatedType(),
4479+
P.beginOffset(), P.size());
4480+
}
4481+
44684482
/// Pre-split loads and stores to simplify rewriting.
44694483
///
44704484
/// We want to break up the splittable load+store pairs as much as
@@ -5012,12 +5026,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
50125026

50135027
// If the common use types are not viable for promotion then attempt to find
50145028
// another type that is viable.
5015-
if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
5029+
if (SliceVecTy && !checkVectorTypeForPromotion(AI, P, SliceVecTy, VScale))
50165030
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
50175031
P.beginOffset(), P.size())) {
50185032
VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
50195033
if (TypePartitionVecTy &&
5020-
checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
5034+
checkVectorTypeForPromotion(AI, P, TypePartitionVecTy, VScale))
50215035
SliceTy = TypePartitionTy;
50225036
}
50235037

@@ -5028,7 +5042,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
50285042
bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
50295043

50305044
VectorType *VecTy =
5031-
IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
5045+
IsIntegerPromotable ? nullptr : isVectorPromotionViable(AI, P, VScale);
50325046
if (VecTy)
50335047
SliceTy = VecTy;
50345048

llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
;; Allocas have been promoted - the linked dbg.assigns have been removed.
2222

2323
;; | V3i point = {0, 0, 0};
24-
; CHECK-NEXT: #dbg_value(<16 x i8> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128),
24+
; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
25+
; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
2526

2627
;; point.z = 5000;
2728
; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
@@ -31,20 +32,17 @@
3132
;; local.other.x = global.other.x
3233
;; local.other.y = global.other.y
3334
;; local.other.z = global.other.z
34-
; CHECK-NEXT: %other.sroa.0.0.copyload = load <8 x i8>, ptr @__const._Z3funv.other
35+
; CHECK-NEXT: %other.sroa.0.0.copyload = load i64, ptr @__const._Z3funv.other
3536
; CHECK-NEXT: %other.sroa.2.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 8)
3637
; CHECK-NEXT: %other.sroa.3.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 16)
37-
; CHECK-NEXT: #dbg_value(<8 x i8> %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
38+
; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
3839
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
3940
; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
4041

4142
;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2);
4243
;; other is now 3 scalars:
4344
;; point.y = other.x
44-
; CHECK-NEXT: %point.sroa.0.sroa.0.8.vec.expand = shufflevector <8 x i8> %other.sroa.0.0.copyload, <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
45-
; CHECK-NEXT: %point.sroa.0.sroa.0.8.vecblend = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %point.sroa.0.sroa.0.8.vec.expand, <16 x i8> zeroinitializer,
46-
; CHECK-NEXT: #dbg_value(<16 x i8> %point.sroa.0.sroa.0.8.vecblend, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
47-
45+
; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
4846
;;
4947
;; point.z = other.y
5048
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),

0 commit comments

Comments
 (0)