Skip to content

Commit 096fde6

Browse files
committed
[SROA] Vector promote some memsets
1 parent 08aedf7 commit 096fde6

File tree

7 files changed

+143
-100
lines changed

7 files changed

+143
-100
lines changed

clang/test/CodeGenOpenCL/amdgpu-nullptr.cl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -503,21 +503,19 @@ void cast_bool_generic(generic char* p) {
503503
*p = 0;
504504
}
505505

506-
// Test initializing a struct using memset.
507-
// For large structures that are mostly zero, clang generates llvm.memset for
508-
// the zero part and store for non-zero members.
506+
// Test initialization of a struct with a private member.
509507
typedef struct {
510508
long a, b, c, d;
511509
private char *p;
512510
} StructTy3;
513511

514-
// CHECK-LABEL: test_memset_private
515-
// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
512+
// CHECK-LABEL: test_struct_private_member
513+
// CHECK: store <32 x i8> zeroinitializer, ptr addrspace(5) {{.*}}, align 8
516514
// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
517515
// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
518516
// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
519517
// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
520-
void test_memset_private(private StructTy3 *ptr) {
518+
void test_struct_private_member(private StructTy3 *ptr) {
521519
StructTy3 S3 = {0, 0, 0, 0, 0};
522520
*ptr = S3;
523521
}

llvm/lib/Transforms/Scalar/SROA.cpp

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,26 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
10111011
return foldSelectInst(cast<SelectInst>(I));
10121012
}
10131013

1014+
/// Returns a fixed vector type equivalent to the memory set by II or nullptr if
1015+
/// unable to do so.
1016+
static FixedVectorType *getVectorTypeFor(const MemSetInst &II,
1017+
const DataLayout &DL) {
1018+
const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1019+
if (!Length)
1020+
return nullptr;
1021+
1022+
APInt Val = Length->getValue();
1023+
if (Val.ugt(std::numeric_limits<unsigned>::max()))
1024+
return nullptr;
1025+
1026+
auto *VTy =
1027+
FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
1028+
if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy))
1029+
return nullptr;
1030+
1031+
return VTy;
1032+
}
1033+
10141034
/// Builder for the alloca slices.
10151035
///
10161036
/// This class builds a set of alloca slices by recursively visiting the uses
@@ -1099,15 +1119,16 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
10991119
return Base::visitGetElementPtrInst(GEPI);
11001120
}
11011121

1122+
bool isSplittableMemOp(Type *Ty, bool IsVolatile) {
1123+
return Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1124+
}
1125+
11021126
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
11031127
uint64_t Size, bool IsVolatile) {
11041128
// We allow splitting of non-volatile loads and stores where the type is an
11051129
// integer type. These may be used to implement 'memcpy' or other "transfer
11061130
// of bits" patterns.
1107-
bool IsSplittable =
1108-
Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1109-
1110-
insertUse(I, Offset, Size, IsSplittable);
1131+
insertUse(I, Offset, Size, isSplittableMemOp(Ty, IsVolatile));
11111132
}
11121133

11131134
void visitLoadInst(LoadInst &LI) {
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
11701191
if (!IsOffsetKnown)
11711192
return PI.setAborted(&II);
11721193

1194+
auto IsSplittable = [&]() {
1195+
FixedVectorType *VTy = getVectorTypeFor(II, DL);
1196+
Type *ATy = AS.AI.getAllocatedType();
1197+
1198+
if (!Length)
1199+
return false;
1200+
if (!VTy)
1201+
return true;
1202+
if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy))
1203+
return true;
1204+
return isSplittableMemOp(ATy, II.isVolatile());
1205+
};
1206+
11731207
insertUse(II, Offset,
11741208
Length ? Length->getLimitedValue()
11751209
: AllocSize - Offset.getLimitedValue(),
1176-
(bool)Length);
1210+
IsSplittable());
11771211
}
11781212

11791213
void visitMemTransferInst(MemTransferInst &II) {
@@ -2072,8 +2106,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
20722106
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
20732107
if (MI->isVolatile())
20742108
return false;
2075-
if (!S.isSplittable())
2076-
return false; // Skip any unsplittable intrinsics.
2109+
2110+
auto *II = dyn_cast<MemSetInst>(U->getUser());
2111+
if (!II && !S.isSplittable()) {
2112+
// Skip any non-memset unsplittable intrinsics.
2113+
return false;
2114+
}
2115+
if (II) {
2116+
// For memset, allow if we have a suitable vector type
2117+
Type *VTy = getVectorTypeFor(*II, DL);
2118+
if (!VTy)
2119+
return false;
2120+
if (!canConvertValue(DL, SliceTy, VTy))
2121+
return false;
2122+
}
20772123
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
20782124
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
20792125
return false;
@@ -2316,12 +2362,15 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
23162362

23172363
// Put load and store types into a set for de-duplication.
23182364
for (const Slice &S : P) {
2319-
Type *Ty;
2365+
Type *Ty = nullptr;
23202366
if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
23212367
Ty = LI->getType();
23222368
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
23232369
Ty = SI->getValueOperand()->getType();
2324-
else
2370+
else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser()))
2371+
Ty = getVectorTypeFor(*II, DL);
2372+
2373+
if (!Ty)
23252374
continue;
23262375

23272376
auto CandTy = Ty->getScalarType();

llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@
2121
;; Allocas have been promoted - the linked dbg.assigns have been removed.
2222

2323
;; | V3i point = {0, 0, 0};
24-
; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
25-
; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
24+
; CHECK-NEXT: #dbg_value(<16 x i8> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128),
2625

2726
;; point.z = 5000;
2827
; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
@@ -32,17 +31,20 @@
3231
;; local.other.x = global.other.x
3332
;; local.other.y = global.other.y
3433
;; local.other.z = global.other.z
35-
; CHECK-NEXT: %other.sroa.0.0.copyload = load i64, ptr @__const._Z3funv.other
34+
; CHECK-NEXT: %other.sroa.0.0.copyload = load <8 x i8>, ptr @__const._Z3funv.other
3635
; CHECK-NEXT: %other.sroa.2.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 8)
3736
; CHECK-NEXT: %other.sroa.3.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 16)
38-
; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
37+
; CHECK-NEXT: #dbg_value(<8 x i8> %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
3938
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
4039
; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
4140

4241
;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2);
4342
;; other is now 3 scalars:
4443
;; point.y = other.x
45-
; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
44+
; CHECK-NEXT: %point.sroa.0.sroa.0.8.vec.expand = shufflevector <8 x i8> %other.sroa.0.0.copyload, <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
45+
; CHECK-NEXT: %point.sroa.0.sroa.0.8.vecblend = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %point.sroa.0.sroa.0.8.vec.expand, <16 x i8> zeroinitializer,
46+
; CHECK-NEXT: #dbg_value(<16 x i8> %point.sroa.0.sroa.0.8.vecblend, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
47+
4648
;;
4749
;; point.z = other.y
4850
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),

llvm/test/DebugInfo/X86/sroasplit-5.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
2121
;
2222
; There should be no debug info for the padding.
2323
; CHECK-NOT: DW_OP_LLVM_fragment, 56
24-
; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
25-
; CHECK-NOT: DW_OP_LLVM_fragment, 56
26-
; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
24+
; CHECK: ![[a:[0-9]+]], !DIExpression(),
2725
; CHECK-NOT: DW_OP_LLVM_fragment, 56
26+
; CHECK: ![[a]] = !DILocalVariable(name: "a",
2827
%struct.prog_src_register = type { i32, i24 }
2928

3029
; Function Attrs: nounwind

llvm/test/Transforms/SROA/basictest.ll

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -529,8 +529,9 @@ entry:
529529
define ptr @test10() {
530530
; CHECK-LABEL: @test10(
531531
; CHECK-NEXT: entry:
532-
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr null to i64
533-
; CHECK-NEXT: ret ptr null
532+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> zeroinitializer to i64
533+
; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
534+
; CHECK-NEXT: ret ptr [[TMP1]]
534535
;
535536
entry:
536537
%a = alloca [8 x i8]
@@ -1075,26 +1076,13 @@ define void @PR14059.1(ptr %d) {
10751076
;
10761077
; CHECK-LABEL: @PR14059.1(
10771078
; CHECK-NEXT: entry:
1078-
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double undef to i64
1079-
; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_MASK:%.*]] = and i64 [[TMP0]], -4294967296
1080-
; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_0_INSERT_MASK]], 0
1081-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[X_SROA_0_I_0_INSERT_INSERT]] to double
1082-
; CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
1083-
; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_MASK:%.*]] = and i64 [[TMP2]], -281474976645121
1084-
; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_2_INSERT_MASK]], 0
1085-
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[X_SROA_0_I_2_INSERT_INSERT]] to double
1086-
; CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
1087-
; CHECK-NEXT: [[X_SROA_0_I_4_COPYLOAD:%.*]] = load i32, ptr [[D:%.*]], align 1
1088-
; CHECK-NEXT: [[TMP5:%.*]] = bitcast double 0.000000e+00 to i64
1089-
; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_EXT:%.*]] = zext i32 [[X_SROA_0_I_4_COPYLOAD]] to i64
1090-
; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_SHIFT:%.*]] = shl i64 [[X_SROA_0_I_4_INSERT_EXT]], 32
1091-
; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK3:%.*]] = and i64 [[TMP5]], 4294967295
1092-
; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT4:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK3]], [[X_SROA_0_I_4_INSERT_SHIFT]]
1093-
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT4]] to double
1094-
; CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to i64
1095-
; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK:%.*]] = and i64 [[TMP7]], 4294967295
1096-
; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK]], 4607182418800017408
1097-
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT]] to double
1079+
; CHECK-NEXT: [[X_SROA_0_I_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i8> undef
1080+
; CHECK-NEXT: [[X_SROA_0_I_SROA_0_2_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef>, <8 x i8> [[X_SROA_0_I_SROA_0_0_VECBLEND]]
1081+
; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_COPYLOAD:%.*]] = load <4 x i8>, ptr [[D:%.*]], align 1
1082+
; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <4 x i8> [[X_SROA_0_I_SROA_0_4_COPYLOAD]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
1083+
; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND2:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VEC_EXPAND]], <8 x i8> zeroinitializer
1084+
; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 0), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 1), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 2), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 3)>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND2]]
1085+
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND]] to double
10981086
; CHECK-NEXT: [[ACCUM_REAL_I:%.*]] = load double, ptr [[D]], align 8
10991087
; CHECK-NEXT: [[ADD_R_I:%.*]] = fadd double [[ACCUM_REAL_I]], [[TMP8]]
11001088
; CHECK-NEXT: store double [[ADD_R_I]], ptr [[D]], align 8
@@ -1332,10 +1320,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
13321320
; CHECK-NEXT: entry:
13331321
; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4
13341322
; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [
1335-
; CHECK-NEXT: i32 4, label [[BB4:%.*]]
1336-
; CHECK-NEXT: i32 3, label [[BB3:%.*]]
1337-
; CHECK-NEXT: i32 2, label [[BB2:%.*]]
1338-
; CHECK-NEXT: i32 1, label [[BB1:%.*]]
1323+
; CHECK-NEXT: i32 4, label [[BB4:%.*]]
1324+
; CHECK-NEXT: i32 3, label [[BB3:%.*]]
1325+
; CHECK-NEXT: i32 2, label [[BB2:%.*]]
1326+
; CHECK-NEXT: i32 1, label [[BB1:%.*]]
13391327
; CHECK-NEXT: ]
13401328
; CHECK: bb4:
13411329
; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3

llvm/test/Transforms/SROA/slice-width.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,8 @@ define void @memcpy_fp80_padding() {
6868

6969
define void @memset_fp80_padding() {
7070
; CHECK-LABEL: @memset_fp80_padding(
71-
; CHECK-NEXT: [[X_SROA_0:%.*]] = alloca x86_fp80, align 16
72-
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[X_SROA_0]], i8 -1, i32 16, i1 false)
73-
; CHECK-NEXT: store i64 -1, ptr @i64_sink, align 4
71+
; CHECK-NEXT: [[X_SROA_0_16_VEC_EXTRACT:%.*]] = extractelement <4 x i64> splat (i64 -1), i32 2
72+
; CHECK-NEXT: store i64 [[X_SROA_0_16_VEC_EXTRACT]], ptr @i64_sink, align 4
7473
; CHECK-NEXT: ret void
7574
;
7675
%x = alloca %union.Foo

0 commit comments

Comments
 (0)