Skip to content

Commit bf2f241

Browse files
committed
[SLP]Support LShr as base for copyable elements
Added support for LShr instructions as the base for copyable elements. Also added a simple analysis for selecting the best base instruction when multiple candidates are available. Fixed scheduling after cancellation. Reviewers: hiraditya, RKSimon. Reviewed By: RKSimon. Pull Request: #153393
1 parent 3f797a8 commit bf2f241

File tree

4 files changed

+70
-41
lines changed

4 files changed

+70
-41
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 64 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4365,14 +4365,18 @@ class BoUpSLP {
43654365
} else {
43664366
// Build a map for gathered scalars to the nodes where they are used.
43674367
bool AllConstsOrCasts = true;
4368-
for (Value *V : VL)
4368+
for (Value *V : VL) {
4369+
if (S && S.areInstructionsWithCopyableElements() &&
4370+
S.isCopyableElement(V))
4371+
Last->addCopyableElement(V);
43694372
if (!isConstant(V)) {
43704373
auto *I = dyn_cast<CastInst>(V);
43714374
AllConstsOrCasts &= I && I->getType()->isIntegerTy();
43724375
if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
43734376
!UserTreeIdx.UserTE->isGather())
43744377
ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
43754378
}
4379+
}
43764380
if (AllConstsOrCasts)
43774381
CastMaxMinBWSizes =
43784382
std::make_pair(std::numeric_limits<unsigned>::max(), 1);
@@ -10564,35 +10568,41 @@ class InstructionsCompatibilityAnalysis {
1056410568
unsigned MainOpcode = 0;
1056510569
Instruction *MainOp = nullptr;
1056610570

10571+
/// Checks if the opcode is supported as the main opcode for copyable
10572+
/// elements.
10573+
static bool isSupportedOpcode(const unsigned Opcode) {
10574+
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10575+
}
10576+
1056710577
/// Identifies the best candidate value, which represents main opcode
1056810578
/// operation.
1056910579
/// Currently the best candidate is a supported instruction (Add or LShr) with the parent
1057010580
/// block with the highest DFS incoming number (block, that dominates other).
1057110581
void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
1057210582
BasicBlock *Parent = nullptr;
1057310583
// Checks if the instruction has supported opcode.
10574-
auto IsSupportedOpcode = [&](Instruction *I) {
10575-
return I && I->getOpcode() == Instruction::Add &&
10584+
auto IsSupportedInstruction = [&](Instruction *I) {
10585+
return I && isSupportedOpcode(I->getOpcode()) &&
1057610586
(!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
1057710587
};
1057810588
// Exclude operand instructions immediately to improve compile time; they
1057910589
// cannot be scheduled anyway.
1058010590
SmallDenseSet<Value *, 8> Operands;
10591+
SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
1058110592
for (Value *V : VL) {
1058210593
auto *I = dyn_cast<Instruction>(V);
1058310594
if (!I)
1058410595
continue;
1058510596
if (!DT.isReachableFromEntry(I->getParent()))
1058610597
continue;
10587-
if (!MainOp) {
10588-
MainOp = I;
10598+
if (Candidates.empty()) {
10599+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1058910600
Parent = I->getParent();
1059010601
Operands.insert(I->op_begin(), I->op_end());
1059110602
continue;
1059210603
}
1059310604
if (Parent == I->getParent()) {
10594-
if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
10595-
MainOp = I;
10605+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1059610606
Operands.insert(I->op_begin(), I->op_end());
1059710607
continue;
1059810608
}
@@ -10604,24 +10614,35 @@ class InstructionsCompatibilityAnalysis {
1060410614
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
1060510615
"Different nodes should have different DFS numbers");
1060610616
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10607-
MainOp = I;
10617+
Candidates.clear();
10618+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1060810619
Parent = I->getParent();
1060910620
Operands.clear();
1061010621
Operands.insert(I->op_begin(), I->op_end());
1061110622
}
1061210623
}
10613-
if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
10614-
MainOp = nullptr;
10615-
return;
10624+
unsigned BestOpcodeNum = 0;
10625+
MainOp = nullptr;
10626+
for (const auto &P : Candidates) {
10627+
if (P.second.size() < BestOpcodeNum)
10628+
continue;
10629+
for (Instruction *I : P.second) {
10630+
if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10631+
MainOp = I;
10632+
BestOpcodeNum = P.second.size();
10633+
break;
10634+
}
10635+
}
1061610636
}
10617-
MainOpcode = MainOp->getOpcode();
10637+
if (MainOp)
10638+
MainOpcode = MainOp->getOpcode();
1061810639
}
1061910640

1062010641
/// Returns the idempotent value for the \p MainOp with the detected \p
1062110642
/// MainOpcode. For Add and LShr, returns 0. For Or, it should choose between false and
1062210643
/// the operand itself, since V or V == V.
1062310644
Value *selectBestIdempotentValue() const {
10624-
assert(MainOpcode == Instruction::Add && "Unsupported opcode");
10645+
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
1062510646
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
1062610647
!MainOp->isCommutative());
1062710648
}
@@ -10634,13 +10655,8 @@ class InstructionsCompatibilityAnalysis {
1063410655
return {V, V};
1063510656
if (!S.isCopyableElement(V))
1063610657
return convertTo(cast<Instruction>(V), S).second;
10637-
switch (MainOpcode) {
10638-
case Instruction::Add:
10639-
return {V, selectBestIdempotentValue()};
10640-
default:
10641-
break;
10642-
}
10643-
llvm_unreachable("Unsupported opcode");
10658+
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10659+
return {V, selectBestIdempotentValue()};
1064410660
}
1064510661

1064610662
/// Builds operands for the original instructions.
@@ -10853,6 +10869,21 @@ class InstructionsCompatibilityAnalysis {
1085310869
}
1085410870
if (!Res)
1085510871
return InstructionsState::invalid();
10872+
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10873+
InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10874+
InstructionCost VectorCost;
10875+
FixedVectorType *VecTy =
10876+
getWidenedType(S.getMainOp()->getType(), VL.size());
10877+
switch (MainOpcode) {
10878+
case Instruction::Add:
10879+
case Instruction::LShr:
10880+
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10881+
break;
10882+
default:
10883+
llvm_unreachable("Unexpected instruction.");
10884+
}
10885+
if (VectorCost > ScalarCost)
10886+
return InstructionsState::invalid();
1085610887
return S;
1085710888
}
1085810889
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21090,6 +21121,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2109021121
ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
2109121122
const auto *It = find(Op, CD->getInst());
2109221123
assert(It != Op.end() && "Lane not set");
21124+
SmallPtrSet<Instruction *, 4> Visited;
2109321125
do {
2109421126
int Lane = std::distance(Op.begin(), It);
2109521127
assert(Lane >= 0 && "Lane not set");
@@ -21111,13 +21143,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2111121143
(InsertInReadyList && UseSD->isReady()))
2111221144
WorkList.push_back(UseSD);
2111321145
}
21114-
} else if (ScheduleData *UseSD = getScheduleData(In)) {
21115-
CD->incDependencies();
21116-
if (!UseSD->isScheduled())
21117-
CD->incrementUnscheduledDeps(1);
21118-
if (!UseSD->hasValidDependencies() ||
21119-
(InsertInReadyList && UseSD->isReady()))
21120-
WorkList.push_back(UseSD);
21146+
} else if (Visited.insert(In).second) {
21147+
if (ScheduleData *UseSD = getScheduleData(In)) {
21148+
CD->incDependencies();
21149+
if (!UseSD->isScheduled())
21150+
CD->incrementUnscheduledDeps(1);
21151+
if (!UseSD->hasValidDependencies() ||
21152+
(InsertInReadyList && UseSD->isReady()))
21153+
WorkList.push_back(UseSD);
21154+
}
2112121155
}
2112221156
It = find(make_range(std::next(It), Op.end()), CD->getInst());
2112321157
} while (It != Op.end());
@@ -21875,9 +21909,11 @@ bool BoUpSLP::collectValuesToDemote(
2187521909
return all_of(E.Scalars, [&](Value *V) {
2187621910
if (isa<PoisonValue>(V))
2187721911
return true;
21912+
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21913+
if (E.isCopyableElement(V))
21914+
return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
2187821915
auto *I = cast<Instruction>(V);
2187921916
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21880-
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
2188121917
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
2188221918
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
2188321919
SimplifyQuery(*DL));

llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,8 @@ define i32 @test(ptr %c) {
88
; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
99
; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
1010
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
11-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
12-
; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
13-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
14-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
15-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
11+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
12+
; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
1613
; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
1714
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
1815
; CHECK-NEXT: ret i32 0

llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105-
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
104+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105+
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
108106
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109107
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110108
; CHECK-NEXT: ret <4 x float> [[TMP5]]

llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105-
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
104+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105+
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
108106
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109107
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110108
; CHECK-NEXT: ret <4 x float> [[TMP5]]

0 commit comments

Comments
 (0)