Skip to content

Commit 0a5be0f

Browse files
[SLP]Enable Sub as a base instruction in copyables
This patch adds support for sub instructions as the main instruction for copyable elements. It also adds a check that treats a candidate base instruction as not profitable for selection if at least one instruction with the main opcode is used as an immediate operand. Reviewers: RKSimon, hiraditya. Reviewed By: RKSimon. Pull Request: llvm#163231
1 parent 2bcb3f8 commit 0a5be0f

File tree

4 files changed

+96
-38
lines changed

4 files changed

+96
-38
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 87 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4869,6 +4869,8 @@ class BoUpSLP {
48694869
assert(hasValidDependencies() &&
48704870
"increment of unscheduled deps would be meaningless");
48714871
UnscheduledDeps += Incr;
4872+
assert(UnscheduledDeps >= 0 &&
4873+
"Expected valid number of unscheduled deps");
48724874
return UnscheduledDeps;
48734875
}
48744876

@@ -5331,6 +5333,28 @@ class BoUpSLP {
53315333
// Check all tree entries, if they have operands replaced by copyable
53325334
// data.
53335335
for (TreeEntry *TE : Entries) {
5336+
unsigned Inc = 0;
5337+
bool IsNonSchedulableWithParentPhiNode =
5338+
TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5339+
TE->UserTreeIndex.UserTE->hasState() &&
5340+
TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5341+
// Count the number of unique phi nodes, which are the parent for
5342+
// parent entry, and exit, if all the unique phis are processed.
5343+
if (IsNonSchedulableWithParentPhiNode) {
5344+
SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5345+
const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5346+
for (Value *V : ParentTE->Scalars) {
5347+
auto *PHI = dyn_cast<PHINode>(V);
5348+
if (!PHI)
5349+
continue;
5350+
if (ParentsUniqueUsers.insert(PHI).second &&
5351+
is_contained(PHI->incoming_values(), User))
5352+
++Inc;
5353+
}
5354+
} else {
5355+
Inc = 1;
5356+
}
5357+
53345358
// Check if the user is commutative.
53355359
// The commutatives are handled later, as their operands can be
53365360
// reordered.
@@ -5346,11 +5370,11 @@ class BoUpSLP {
53465370
if (!getScheduleCopyableData(EI, Op))
53475371
continue;
53485372
// Found copyable operand - continue.
5349-
++OpCnt;
5373+
OpCnt += Inc;
53505374
continue;
53515375
}
5352-
++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5353-
.first->getSecond();
5376+
PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5377+
.first->getSecond() += Inc;
53545378
}
53555379
}
53565380
if (PotentiallyReorderedEntriesCount.empty())
@@ -5360,21 +5384,44 @@ class BoUpSLP {
53605384
});
53615385
// Check the commutative/cmp entries.
53625386
for (auto &P : PotentiallyReorderedEntriesCount) {
5387+
SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5388+
bool IsNonSchedulableWithParentPhiNode =
5389+
P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5390+
P.first->UserTreeIndex.UserTE->hasState() &&
5391+
P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
53635392
auto *It = find(P.first->Scalars, User);
5364-
assert(It != P.first->Scalars.end() && "User is not in the tree entry");
5365-
int Lane = std::distance(P.first->Scalars.begin(), It);
5366-
assert(Lane >= 0 && "Lane is not found");
5367-
if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5368-
Lane = P.first->ReorderIndices[Lane];
5369-
assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5370-
"Couldn't find extract lane");
5371-
for (unsigned OpIdx :
5372-
seq<unsigned>(::getNumberOfPotentiallyCommutativeOps(
5373-
P.first->getMainOp()))) {
5374-
if (P.first->getOperand(OpIdx)[Lane] == Op &&
5375-
getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5376-
--P.getSecond();
5377-
}
5393+
do {
5394+
assert(It != P.first->Scalars.end() &&
5395+
"User is not in the tree entry");
5396+
int Lane = std::distance(P.first->Scalars.begin(), It);
5397+
assert(Lane >= 0 && "Lane is not found");
5398+
if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5399+
Lane = P.first->ReorderIndices[Lane];
5400+
assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5401+
"Couldn't find extract lane");
5402+
// Count the number of unique phi nodes, which are the parent for
5403+
// parent entry, and exit, if all the unique phis are processed.
5404+
if (IsNonSchedulableWithParentPhiNode) {
5405+
const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5406+
Value *User = ParentTE->Scalars[Lane];
5407+
if (!ParentsUniqueUsers.insert(User).second) {
5408+
It =
5409+
find(make_range(std::next(It), P.first->Scalars.end()), User);
5410+
continue;
5411+
}
5412+
}
5413+
for (unsigned OpIdx :
5414+
seq<unsigned>(::getNumberOfPotentiallyCommutativeOps(
5415+
P.first->getMainOp()))) {
5416+
if (P.first->getOperand(OpIdx)[Lane] == Op &&
5417+
getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5418+
--P.getSecond();
5419+
}
5420+
// If parent node is schedulable, it will be handled correctly.
5421+
if (!IsNonSchedulableWithParentPhiNode)
5422+
break;
5423+
It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5424+
} while (It != P.first->Scalars.end());
53785425
}
53795426
return all_of(PotentiallyReorderedEntriesCount,
53805427
[&](const std::pair<const TreeEntry *, unsigned> &P) {
@@ -5648,8 +5695,11 @@ class BoUpSLP {
56485695
const TreeEntry *ParentTE =
56495696
Bundle->getTreeEntry()->UserTreeIndex.UserTE;
56505697
Value *User = ParentTE->Scalars[Lane];
5651-
if (!ParentsUniqueUsers.insert(User).second)
5652-
break;
5698+
if (!ParentsUniqueUsers.insert(User).second) {
5699+
It = std::find(std::next(It),
5700+
Bundle->getTreeEntry()->Scalars.end(), In);
5701+
continue;
5702+
}
56535703
}
56545704

56555705
for (unsigned OpIdx :
@@ -10745,10 +10795,11 @@ class InstructionsCompatibilityAnalysis {
1074510795
/// Checks if the opcode is supported as the main opcode for copyable
1074610796
/// elements.
1074710797
static bool isSupportedOpcode(const unsigned Opcode) {
10748-
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10749-
Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10750-
Opcode == Instruction::UDiv || Opcode == Instruction::And ||
10751-
Opcode == Instruction::Or || Opcode == Instruction::Xor;
10798+
return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
10799+
Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
10800+
Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
10801+
Opcode == Instruction::And || Opcode == Instruction::Or ||
10802+
Opcode == Instruction::Xor;
1075210803
}
1075310804

1075410805
/// Identifies the best candidate value, which represents main opcode
@@ -10808,8 +10859,12 @@ class InstructionsCompatibilityAnalysis {
1080810859
for (const auto &P : Candidates) {
1080910860
if (P.second.size() < BestOpcodeNum)
1081010861
continue;
10862+
// If have inner dependencies - skip.
10863+
if (any_of(P.second,
10864+
[&](Instruction *I) { return Operands.contains(I); }))
10865+
continue;
1081110866
for (Instruction *I : P.second) {
10812-
if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
10867+
if (IsSupportedInstruction(I, AnyUndef)) {
1081310868
MainOp = I;
1081410869
BestOpcodeNum = P.second.size();
1081510870
break;
@@ -11069,6 +11124,7 @@ class InstructionsCompatibilityAnalysis {
1106911124
getWidenedType(S.getMainOp()->getType(), VL.size());
1107011125
switch (MainOpcode) {
1107111126
case Instruction::Add:
11127+
case Instruction::Sub:
1107211128
case Instruction::LShr:
1107311129
case Instruction::Shl:
1107411130
case Instruction::SDiv:
@@ -19686,8 +19742,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1968619742
V = ::propagateMetadata(I, E->Scalars);
1968719743
// Drop nuw flags for abs(sub(commutative), true).
1968819744
if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19689-
any_of(E->Scalars, [](Value *V) {
19690-
return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19745+
any_of(E->Scalars, [E](Value *V) {
19746+
return isa<PoisonValue>(V) ||
19747+
(E->hasCopyableElements() && E->isCopyableElement(V)) ||
19748+
isCommutative(cast<Instruction>(V));
1969119749
}))
1969219750
I->setHasNoUnsignedWrap(/*b=*/false);
1969319751
}
@@ -20091,9 +20149,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2009120149
// Drop nuw flags for abs(sub(commutative), true).
2009220150
if (auto *I = dyn_cast<Instruction>(Vec);
2009320151
I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20094-
any_of(E->Scalars, [](Value *V) {
20152+
any_of(E->Scalars, [E](Value *V) {
2009520153
if (isa<PoisonValue>(V))
2009620154
return false;
20155+
if (E->hasCopyableElements() && E->isCopyableElement(V))
20156+
return false;
2009720157
auto *IV = cast<Instruction>(V);
2009820158
return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
2009920159
}))

llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,8 @@
44
define i8 @test() {
55
; CHECK-LABEL: define i8 @test() {
66
; CHECK-NEXT: [[ENTRY:.*:]]
7-
; CHECK-NEXT: [[SUB_I_I79_PEEL_I:%.*]] = sub i16 0, 1
8-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> <i16 poison, i16 0>, i16 [[SUB_I_I79_PEEL_I]], i32 0
9-
; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
10-
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> zeroinitializer, [[TMP2]]
11-
; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i16>
12-
; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i16> [[TMP3]], [[TMP0]]
13-
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP4]], [[TMP0]]
14-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
15-
; CHECK-NEXT: [[CONV13_I89_PEEL_I:%.*]] = zext i1 [[TMP5]] to i8
7+
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i16> <i16 -1, i16 0>, <i16 -1, i16 0>
8+
; CHECK-NEXT: [[CONV13_I89_PEEL_I:%.*]] = zext i1 false to i8
169
; CHECK-NEXT: ret i8 [[CONV13_I89_PEEL_I]]
1710
;
1811
entry:

llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-parent-multi-copyables.ll

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,14 @@ define void @test() {
66
; CHECK-NEXT: [[BB:.*]]:
77
; CHECK-NEXT: br i1 false, label %[[BB1:.*]], label %[[BB6:.*]]
88
; CHECK: [[BB1]]:
9+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>, i32 -1, i32 2
10+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 0, i32 3
11+
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> <i32 0, i32 0, i32 0, i32 -1>, [[TMP6]]
12+
; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> <i32 0, i32 0, i32 0, i32 -1>, [[TMP6]]
13+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
914
; CHECK-NEXT: br label %[[BB6]]
1015
; CHECK: [[BB6]]:
11-
; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ <i32 0, i32 0, i32 poison, i32 0>, %[[BB]] ], [ <i32 0, i32 0, i32 -1, i32 -1>, %[[BB1]] ]
16+
; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ <i32 0, i32 0, i32 poison, i32 0>, %[[BB]] ], [ [[TMP4]], %[[BB1]] ]
1217
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
1318
; CHECK-NEXT: [[OR:%.*]] = or i32 [[TMP1]], 0
1419
; CHECK-NEXT: ret void

llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ define void @addsub1(ptr noalias %dst, ptr noalias %src) {
183183
; CHECK-LABEL: @addsub1(
184184
; CHECK-NEXT: entry:
185185
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
186-
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 1, i32 0, i32 3>
186+
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> [[TMP0]], <i32 1, i32 -1, i32 0, i32 -3>
187187
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
188188
; CHECK-NEXT: ret void
189189
;

0 commit comments

Comments
 (0)