Skip to content

Commit ca4ebf9

Browse files
[SLP] Support LShr as base for copyable elements
Added support for LShr instructions as the base for copyable elements. Also added a simple analysis that selects the best base instruction when multiple candidates are available. Reviewers: hiraditya, RKSimon. Reviewed By: RKSimon. Pull Request: #153393
1 parent e2ae634 commit ca4ebf9

File tree

4 files changed

+65
-40
lines changed

4 files changed

+65
-40
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 59 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10564,35 +10564,41 @@ class InstructionsCompatibilityAnalysis {
1056410564
unsigned MainOpcode = 0;
1056510565
Instruction *MainOp = nullptr;
1056610566

10567+
/// Checks if the opcode is supported as the main opcode for copyable
10568+
/// elements.
10569+
static bool isSupportedOpcode(const unsigned Opcode) {
10570+
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10571+
}
10572+
1056710573
/// Identifies the best candidate value, which represents main opcode
1056810574
/// operation.
1056910575
/// Currently the best candidate is a supported instruction (Add or LShr)
1057010576
/// whose parent block has the highest DFS incoming number.
1057110577
void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
1057210578
BasicBlock *Parent = nullptr;
1057310579
// Checks if the instruction has supported opcode.
10574-
auto IsSupportedOpcode = [&](Instruction *I) {
10575-
return I && I->getOpcode() == Instruction::Add &&
10580+
auto IsSupportedInstruction = [&](Instruction *I) {
10581+
return I && isSupportedOpcode(I->getOpcode()) &&
1057610582
(!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
1057710583
};
1057810584
// Exclude operand instructions immediately to improve compile time; they
1057910585
// cannot be scheduled anyway.
1058010586
SmallDenseSet<Value *, 8> Operands;
10587+
SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
1058110588
for (Value *V : VL) {
1058210589
auto *I = dyn_cast<Instruction>(V);
1058310590
if (!I)
1058410591
continue;
1058510592
if (!DT.isReachableFromEntry(I->getParent()))
1058610593
continue;
10587-
if (!MainOp) {
10588-
MainOp = I;
10594+
if (Candidates.empty()) {
10595+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1058910596
Parent = I->getParent();
1059010597
Operands.insert(I->op_begin(), I->op_end());
1059110598
continue;
1059210599
}
1059310600
if (Parent == I->getParent()) {
10594-
if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
10595-
MainOp = I;
10601+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1059610602
Operands.insert(I->op_begin(), I->op_end());
1059710603
continue;
1059810604
}
@@ -10604,24 +10610,35 @@ class InstructionsCompatibilityAnalysis {
1060410610
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
1060510611
"Different nodes should have different DFS numbers");
1060610612
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10607-
MainOp = I;
10613+
Candidates.clear();
10614+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1060810615
Parent = I->getParent();
1060910616
Operands.clear();
1061010617
Operands.insert(I->op_begin(), I->op_end());
1061110618
}
1061210619
}
10613-
if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
10614-
MainOp = nullptr;
10615-
return;
10620+
unsigned BestOpcodeNum = 0;
10621+
MainOp = nullptr;
10622+
for (const auto &P : Candidates) {
10623+
if (P.second.size() < BestOpcodeNum)
10624+
continue;
10625+
for (Instruction *I : P.second) {
10626+
if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10627+
MainOp = I;
10628+
BestOpcodeNum = P.second.size();
10629+
break;
10630+
}
10631+
}
1061610632
}
10617-
MainOpcode = MainOp->getOpcode();
10633+
if (MainOp)
10634+
MainOpcode = MainOp->getOpcode();
1061810635
}
1061910636

1062010637
/// Returns the idempotent value for the \p MainOp with the detected \p
1062110638
/// MainOpcode. For Add, returns 0. For Or, it should choose between false and
1062210639
/// the operand itself, since V or V == V.
1062310640
Value *selectBestIdempotentValue() const {
10624-
assert(MainOpcode == Instruction::Add && "Unsupported opcode");
10641+
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
1062510642
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
1062610643
!MainOp->isCommutative());
1062710644
}
@@ -10634,13 +10651,8 @@ class InstructionsCompatibilityAnalysis {
1063410651
return {V, V};
1063510652
if (!S.isCopyableElement(V))
1063610653
return convertTo(cast<Instruction>(V), S).second;
10637-
switch (MainOpcode) {
10638-
case Instruction::Add:
10639-
return {V, selectBestIdempotentValue()};
10640-
default:
10641-
break;
10642-
}
10643-
llvm_unreachable("Unsupported opcode");
10654+
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10655+
return {V, selectBestIdempotentValue()};
1064410656
}
1064510657

1064610658
/// Builds operands for the original instructions.
@@ -10853,6 +10865,21 @@ class InstructionsCompatibilityAnalysis {
1085310865
}
1085410866
if (!Res)
1085510867
return InstructionsState::invalid();
10868+
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10869+
InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10870+
InstructionCost VectorCost;
10871+
FixedVectorType *VecTy =
10872+
getWidenedType(S.getMainOp()->getType(), VL.size());
10873+
switch (MainOpcode) {
10874+
case Instruction::Add:
10875+
case Instruction::LShr:
10876+
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10877+
break;
10878+
default:
10879+
llvm_unreachable("Unexpected instruction.");
10880+
}
10881+
if (VectorCost > ScalarCost)
10882+
return InstructionsState::invalid();
1085610883
return S;
1085710884
}
1085810885
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21090,6 +21117,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2109021117
ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
2109121118
const auto *It = find(Op, CD->getInst());
2109221119
assert(It != Op.end() && "Lane not set");
21120+
SmallPtrSet<Instruction *, 4> Visited;
2109321121
do {
2109421122
int Lane = std::distance(Op.begin(), It);
2109521123
assert(Lane >= 0 && "Lane not set");
@@ -21111,13 +21139,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2111121139
(InsertInReadyList && UseSD->isReady()))
2111221140
WorkList.push_back(UseSD);
2111321141
}
21114-
} else if (ScheduleData *UseSD = getScheduleData(In)) {
21115-
CD->incDependencies();
21116-
if (!UseSD->isScheduled())
21117-
CD->incrementUnscheduledDeps(1);
21118-
if (!UseSD->hasValidDependencies() ||
21119-
(InsertInReadyList && UseSD->isReady()))
21120-
WorkList.push_back(UseSD);
21142+
} else if (Visited.insert(In).second) {
21143+
if (ScheduleData *UseSD = getScheduleData(In)) {
21144+
CD->incDependencies();
21145+
if (!UseSD->isScheduled())
21146+
CD->incrementUnscheduledDeps(1);
21147+
if (!UseSD->hasValidDependencies() ||
21148+
(InsertInReadyList && UseSD->isReady()))
21149+
WorkList.push_back(UseSD);
21150+
}
2112121151
}
2112221152
It = find(make_range(std::next(It), Op.end()), CD->getInst());
2112321153
} while (It != Op.end());
@@ -21875,9 +21905,11 @@ bool BoUpSLP::collectValuesToDemote(
2187521905
return all_of(E.Scalars, [&](Value *V) {
2187621906
if (isa<PoisonValue>(V))
2187721907
return true;
21908+
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21909+
if (E.isCopyableElement(V))
21910+
return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
2187821911
auto *I = cast<Instruction>(V);
2187921912
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21880-
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
2188121913
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
2188221914
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
2188321915
SimplifyQuery(*DL));

llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,8 @@ define i32 @test(ptr %c) {
88
; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
99
; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
1010
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
11-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
12-
; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
13-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
14-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
15-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
11+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
12+
; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
1613
; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
1714
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
1815
; CHECK-NEXT: ret i32 0

llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105-
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
104+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105+
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
108106
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109107
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110108
; CHECK-NEXT: ret <4 x float> [[TMP5]]

llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105-
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
104+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105+
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
108106
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109107
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110108
; CHECK-NEXT: ret <4 x float> [[TMP5]]

0 commit comments

Comments
 (0)