Skip to content

Commit 209627b

Browse files
alexey-bataev, ronlieb
authored and committed
[SLP]Support LShr as base for copyable elements
Added support for LShr instructions as the base for copyable elements. Also added a simple analysis for selecting the best base instruction when multiple candidates are available. Fixed scheduling after cancellation. Reviewers: hiraditya, RKSimon. Reviewed By: RKSimon. Pull Request: llvm#153393
1 parent 731ea1d commit 209627b

File tree

4 files changed

+68
-40
lines changed

4 files changed

+68
-40
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4397,14 +4397,18 @@ class BoUpSLP {
43974397
} else {
43984398
// Build a map for gathered scalars to the nodes where they are used.
43994399
bool AllConstsOrCasts = true;
4400-
for (Value *V : VL)
4400+
for (Value *V : VL) {
4401+
if (S && S.areInstructionsWithCopyableElements() &&
4402+
S.isCopyableElement(V))
4403+
Last->addCopyableElement(V);
44014404
if (!isConstant(V)) {
44024405
auto *I = dyn_cast<CastInst>(V);
44034406
AllConstsOrCasts &= I && I->getType()->isIntegerTy();
44044407
if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
44054408
!UserTreeIdx.UserTE->isGather())
44064409
ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
44074410
}
4411+
}
44084412
if (AllConstsOrCasts)
44094413
CastMaxMinBWSizes =
44104414
std::make_pair(std::numeric_limits<unsigned>::max(), 1);
@@ -10613,35 +10617,41 @@ class InstructionsCompatibilityAnalysis {
1061310617
unsigned MainOpcode = 0;
1061410618
Instruction *MainOp = nullptr;
1061510619

10620+
/// Checks if the opcode is supported as the main opcode for copyable
10621+
/// elements.
10622+
static bool isSupportedOpcode(const unsigned Opcode) {
10623+
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10624+
}
10625+
1061610626
/// Identifies the best candidate value, which represents main opcode
1061710627
/// operation.
1061810628
/// Currently the best candidate is the Add instruction with the parent
1061910629
/// block with the highest DFS incoming number (block, that dominates other).
1062010630
void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
1062110631
BasicBlock *Parent = nullptr;
1062210632
// Checks if the instruction has supported opcode.
10623-
auto IsSupportedOpcode = [&](Instruction *I) {
10624-
return I && I->getOpcode() == Instruction::Add &&
10633+
auto IsSupportedInstruction = [&](Instruction *I) {
10634+
return I && isSupportedOpcode(I->getOpcode()) &&
1062510635
(!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
1062610636
};
1062710637
// Exclude operands instructions immediately to improve compile time, it
1062810638
// will be unable to schedule anyway.
1062910639
SmallDenseSet<Value *, 8> Operands;
10640+
SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
1063010641
for (Value *V : VL) {
1063110642
auto *I = dyn_cast<Instruction>(V);
1063210643
if (!I)
1063310644
continue;
1063410645
if (!DT.isReachableFromEntry(I->getParent()))
1063510646
continue;
10636-
if (!MainOp) {
10637-
MainOp = I;
10647+
if (Candidates.empty()) {
10648+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1063810649
Parent = I->getParent();
1063910650
Operands.insert(I->op_begin(), I->op_end());
1064010651
continue;
1064110652
}
1064210653
if (Parent == I->getParent()) {
10643-
if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
10644-
MainOp = I;
10654+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1064510655
Operands.insert(I->op_begin(), I->op_end());
1064610656
continue;
1064710657
}
@@ -10653,15 +10663,25 @@ class InstructionsCompatibilityAnalysis {
1065310663
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
1065410664
"Different nodes should have different DFS numbers");
1065510665
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10656-
MainOp = I;
10666+
Candidates.clear();
10667+
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1065710668
Parent = I->getParent();
1065810669
Operands.clear();
1065910670
Operands.insert(I->op_begin(), I->op_end());
1066010671
}
1066110672
}
10662-
if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
10663-
MainOp = nullptr;
10664-
return;
10673+
unsigned BestOpcodeNum = 0;
10674+
MainOp = nullptr;
10675+
for (const auto &P : Candidates) {
10676+
if (P.second.size() < BestOpcodeNum)
10677+
continue;
10678+
for (Instruction *I : P.second) {
10679+
if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10680+
MainOp = I;
10681+
BestOpcodeNum = P.second.size();
10682+
break;
10683+
}
10684+
}
1066510685
}
1066610686
if (MainOp) {
1066710687
// Do not match, if any copyable is a terminator from the same block as
@@ -10682,7 +10702,7 @@ class InstructionsCompatibilityAnalysis {
1068210702
/// MainOpcode. For Add, returns 0. For Or, it should choose between false and
1068310703
/// the operand itself, since V or V == V.
1068410704
Value *selectBestIdempotentValue() const {
10685-
assert(MainOpcode == Instruction::Add && "Unsupported opcode");
10705+
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
1068610706
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
1068710707
!MainOp->isCommutative());
1068810708
}
@@ -10695,13 +10715,8 @@ class InstructionsCompatibilityAnalysis {
1069510715
return {V, V};
1069610716
if (!S.isCopyableElement(V))
1069710717
return convertTo(cast<Instruction>(V), S).second;
10698-
switch (MainOpcode) {
10699-
case Instruction::Add:
10700-
return {V, selectBestIdempotentValue()};
10701-
default:
10702-
break;
10703-
}
10704-
llvm_unreachable("Unsupported opcode");
10718+
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10719+
return {V, selectBestIdempotentValue()};
1070510720
}
1070610721

1070710722
/// Builds operands for the original instructions.
@@ -10914,6 +10929,21 @@ class InstructionsCompatibilityAnalysis {
1091410929
}
1091510930
if (!Res)
1091610931
return InstructionsState::invalid();
10932+
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10933+
InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10934+
InstructionCost VectorCost;
10935+
FixedVectorType *VecTy =
10936+
getWidenedType(S.getMainOp()->getType(), VL.size());
10937+
switch (MainOpcode) {
10938+
case Instruction::Add:
10939+
case Instruction::LShr:
10940+
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10941+
break;
10942+
default:
10943+
llvm_unreachable("Unexpected instruction.");
10944+
}
10945+
if (VectorCost > ScalarCost)
10946+
return InstructionsState::invalid();
1091710947
return S;
1091810948
}
1091910949
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21200,6 +21230,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2120021230
ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
2120121231
const auto *It = find(Op, CD->getInst());
2120221232
assert(It != Op.end() && "Lane not set");
21233+
SmallPtrSet<Instruction *, 4> Visited;
2120321234
do {
2120421235
int Lane = std::distance(Op.begin(), It);
2120521236
assert(Lane >= 0 && "Lane not set");
@@ -21221,13 +21252,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2122121252
(InsertInReadyList && UseSD->isReady()))
2122221253
WorkList.push_back(UseSD);
2122321254
}
21224-
} else if (ScheduleData *UseSD = getScheduleData(In)) {
21225-
CD->incDependencies();
21226-
if (!UseSD->isScheduled())
21227-
CD->incrementUnscheduledDeps(1);
21228-
if (!UseSD->hasValidDependencies() ||
21229-
(InsertInReadyList && UseSD->isReady()))
21230-
WorkList.push_back(UseSD);
21255+
} else if (Visited.insert(In).second) {
21256+
if (ScheduleData *UseSD = getScheduleData(In)) {
21257+
CD->incDependencies();
21258+
if (!UseSD->isScheduled())
21259+
CD->incrementUnscheduledDeps(1);
21260+
if (!UseSD->hasValidDependencies() ||
21261+
(InsertInReadyList && UseSD->isReady()))
21262+
WorkList.push_back(UseSD);
21263+
}
2123121264
}
2123221265
It = find(make_range(std::next(It), Op.end()), CD->getInst());
2123321266
} while (It != Op.end());
@@ -21989,9 +22022,11 @@ bool BoUpSLP::collectValuesToDemote(
2198922022
return all_of(E.Scalars, [&](Value *V) {
2199022023
if (isa<PoisonValue>(V))
2199122024
return true;
22025+
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22026+
if (E.isCopyableElement(V))
22027+
return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
2199222028
auto *I = cast<Instruction>(V);
2199322029
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21994-
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
2199522030
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
2199622031
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
2199722032
SimplifyQuery(*DL));

llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,8 @@ define i32 @test(ptr %c) {
88
; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
99
; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
1010
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
11-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
12-
; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
13-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
14-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
15-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
11+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
12+
; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
1613
; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
1714
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
1815
; CHECK-NEXT: ret i32 0

llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105-
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
104+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105+
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
108106
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109107
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110108
; CHECK-NEXT: ret <4 x float> [[TMP5]]

llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105-
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
104+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105+
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
108106
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109107
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110108
; CHECK-NEXT: ret <4 x float> [[TMP5]]

0 commit comments

Comments
 (0)