Skip to content

Commit a80a198

Browse files
[SLP]Better support for copyable values in stores
Currently, stores are sorted by the instruction types of their stored values, and that sorting does not include analysis for copyables. As a result, the compiler may miss some potential vectorization opportunities. This patch adds detection of copyables in the stored values. Reviewers: hiraditya, HanKuanChen, RKSimon. Reviewed By: RKSimon. Pull Request: #153213
1 parent 5176fb8 commit a80a198

File tree

3 files changed

+69
-90
lines changed

3 files changed

+69
-90
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25533,7 +25533,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
2553325533
template <typename T>
2553425534
static bool tryToVectorizeSequence(
2553525535
SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25536-
function_ref<bool(T *, T *)> AreCompatible,
25536+
function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
2553725537
function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
2553825538
bool MaxVFOnly, BoUpSLP &R) {
2553925539
bool Changed = false;
@@ -25555,7 +25555,7 @@ static bool tryToVectorizeSequence(
2555525555
auto *SameTypeIt = IncIt;
2555625556
while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
2555725557
R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25558-
AreCompatible(*SameTypeIt, *IncIt))) {
25558+
AreCompatible(VL, *SameTypeIt))) {
2555925559
auto *I = dyn_cast<Instruction>(*SameTypeIt);
2556025560
++SameTypeIt;
2556125561
if (I && !R.isDeleted(I))
@@ -25753,10 +25753,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
2575325753
return compareCmp<false>(V, V2, *TLI, *DT);
2575425754
};
2575525755

25756-
auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
25757-
if (V1 == V2)
25756+
auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
25757+
if (VL.empty() || VL.back() == V1)
2575825758
return true;
25759-
return compareCmp<true>(V1, V2, *TLI, *DT);
25759+
return compareCmp<true>(V1, VL.back(), *TLI, *DT);
2576025760
};
2576125761

2576225762
SmallVector<Value *> Vals;
@@ -25962,9 +25962,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
2596225962
}
2596325963
return false;
2596425964
};
25965-
auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
25966-
if (V1 == V2)
25965+
auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
25966+
Value *V1) {
25967+
if (VL.empty() || V1 == VL.back())
2596725968
return true;
25969+
Value *V2 = VL.back();
2596825970
if (V1->getType() != V2->getType())
2596925971
return false;
2597025972
ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
@@ -26335,7 +26337,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
2633526337
V2->getValueOperand()->getValueID();
2633626338
};
2633726339

26338-
auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
26340+
bool SameParent = true;
26341+
auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26342+
if (VL.empty()) {
26343+
SameParent = true;
26344+
return true;
26345+
}
26346+
StoreInst *V2 = VL.back();
2633926347
if (V1 == V2)
2634026348
return true;
2634126349
if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
@@ -26346,15 +26354,34 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
2634626354
if (isa<UndefValue>(V1->getValueOperand()) ||
2634726355
isa<UndefValue>(V2->getValueOperand()))
2634826356
return true;
26349-
if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
26350-
if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26351-
if (I1->getParent() != I2->getParent())
26352-
return false;
26353-
return getSameOpcode({I1, I2}, *TLI).valid();
26354-
}
2635526357
if (isa<Constant>(V1->getValueOperand()) &&
2635626358
isa<Constant>(V2->getValueOperand()))
2635726359
return true;
26360+
// Check if the operands of the stores can be vectorized. They can be
26361+
// vectorized, if they have compatible operands or have operands, which can
26362+
// be vectorized as copyables.
26363+
auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26364+
auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26365+
if (I1 || I2) {
26366+
// Accept only tail-following non-compatible values for now.
26367+
// TODO: investigate if it is possible to vectorize incompatible values,
26368+
// if the copyables are first in the list.
26369+
if (I1 && !I2)
26370+
return false;
26371+
SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26372+
SmallVector<Value *> NewVL(VL.size() + 1);
26373+
for (auto [SI, V] : zip(VL, NewVL))
26374+
V = SI->getValueOperand();
26375+
NewVL.back() = V1->getValueOperand();
26376+
InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26377+
InstructionsState S = Analysis.buildInstructionsState(
26378+
NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26379+
/*SkipSameCodeCheck=*/!SameParent);
26380+
if (S)
26381+
return true;
26382+
if (!SameParent)
26383+
return false;
26384+
}
2635826385
return V1->getValueOperand()->getValueID() ==
2635926386
V2->getValueOperand()->getValueID();
2636026387
};

llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,23 @@
44
define void @test_add_sdiv(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
55
; CHECK-LABEL: @test_add_sdiv(
66
; CHECK-NEXT: entry:
7-
; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr i32, ptr [[ARR1:%.*]], i32 2
8-
; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr i32, ptr [[ARR1]], i32 3
97
; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr i32, ptr [[ARR2:%.*]], i32 2
108
; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, ptr [[ARR2]], i32 3
11-
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
12-
; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
13-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
14-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[A1:%.*]], i32 1
15-
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1146, i32 146>
9+
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP2_2]], align 4
10+
; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP2_3]], align 4
1611
; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
17-
; CHECK-NEXT: [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
12+
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARR2]], align 4
13+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0:%.*]], i32 0
14+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
15+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A3:%.*]], i32 3
16+
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 1146, i32 146, i32 0, i32 0>, [[TMP3]]
1817
; CHECK-NEXT: [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]]
19-
; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
20-
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
21-
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
22-
; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[ARR2]], align 4
23-
; CHECK-NEXT: store i32 [[RES2]], ptr [[GEP2_2]], align 4
24-
; CHECK-NEXT: store i32 [[RES3]], ptr [[GEP2_3]], align 4
18+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[RES2]], i32 2
19+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[V3]], i32 3
20+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
21+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
22+
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP4]]
23+
; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[ARR3:%.*]], align 4
2524
; CHECK-NEXT: ret void
2625
;
2726
entry:

llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll

Lines changed: 15 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -33,32 +33,12 @@ entry:
3333
}
3434

3535
define void @add1(ptr noalias %dst, ptr noalias %src) {
36-
; NON-POW2-LABEL: @add1(
37-
; NON-POW2-NEXT: entry:
38-
; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
39-
; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
40-
; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
41-
; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
42-
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
43-
; NON-POW2-NEXT: [[TMP2:%.*]] = add nsw <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
44-
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
45-
; NON-POW2-NEXT: ret void
46-
;
47-
; POW2-ONLY-LABEL: @add1(
48-
; POW2-ONLY-NEXT: entry:
49-
; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
50-
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
51-
; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
52-
; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
53-
; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
54-
; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
55-
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
56-
; POW2-ONLY-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1, i32 2>
57-
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
58-
; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
59-
; POW2-ONLY-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
60-
; POW2-ONLY-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
61-
; POW2-ONLY-NEXT: ret void
36+
; CHECK-LABEL: @add1(
37+
; CHECK-NEXT: entry:
38+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
39+
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
40+
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
41+
; CHECK-NEXT: ret void
6242
;
6343
entry:
6444
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -84,18 +64,9 @@ entry:
8464
define void @sub0(ptr noalias %dst, ptr noalias %src) {
8565
; CHECK-LABEL: @sub0(
8666
; CHECK-NEXT: entry:
87-
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
88-
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
89-
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
90-
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
91-
; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
92-
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
93-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
94-
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
95-
; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
96-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
97-
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
98-
; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
67+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
68+
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 -3>
69+
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
9970
; CHECK-NEXT: ret void
10071
;
10172
entry:
@@ -182,18 +153,9 @@ entry:
182153
define void @addsub0(ptr noalias %dst, ptr noalias %src) {
183154
; CHECK-LABEL: @addsub0(
184155
; CHECK-NEXT: entry:
185-
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
186-
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
187-
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
188-
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
189-
; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
190-
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
191-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
192-
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
193-
; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
194-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
195-
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 3>
196-
; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
156+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
157+
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 3>
158+
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
197159
; CHECK-NEXT: ret void
198160
;
199161
entry:
@@ -220,18 +182,9 @@ entry:
220182
define void @addsub1(ptr noalias %dst, ptr noalias %src) {
221183
; CHECK-LABEL: @addsub1(
222184
; CHECK-NEXT: entry:
223-
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
224-
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
225-
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
226-
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 1>
227-
; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
228-
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
229-
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
230-
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
231-
; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
232-
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
233-
; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
234-
; CHECK-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
185+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
186+
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 1, i32 0, i32 3>
187+
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
235188
; CHECK-NEXT: ret void
236189
;
237190
entry:

0 commit comments

Comments
 (0)