@@ -4397,14 +4397,18 @@ class BoUpSLP {
43974397 } else {
43984398 // Build a map for gathered scalars to the nodes where they are used.
43994399 bool AllConstsOrCasts = true;
4400- for (Value *V : VL)
4400+ for (Value *V : VL) {
4401+ if (S && S.areInstructionsWithCopyableElements() &&
4402+ S.isCopyableElement(V))
4403+ Last->addCopyableElement(V);
44014404 if (!isConstant(V)) {
44024405 auto *I = dyn_cast<CastInst>(V);
44034406 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
44044407 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
44054408 !UserTreeIdx.UserTE->isGather())
44064409 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
44074410 }
4411+ }
44084412 if (AllConstsOrCasts)
44094413 CastMaxMinBWSizes =
44104414 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
@@ -10613,35 +10617,41 @@ class InstructionsCompatibilityAnalysis {
1061310617 unsigned MainOpcode = 0;
1061410618 Instruction *MainOp = nullptr;
1061510619
10620+ /// Returns true if \p Opcode is supported as the main opcode for copyable
10621+ /// elements. Currently only Instruction::Add and Instruction::LShr qualify.
10622+ static bool isSupportedOpcode(const unsigned Opcode) {
10623+ return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10624+ }
10625+
1061610626 /// Identifies the best candidate value, which represents main opcode
1061710627 /// operation.
1061810628 /// Currently the best candidate is the Add instruction with the parent
1061910629 /// block with the highest DFS incoming number (block, that dominates other).
1062010630 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
1062110631 BasicBlock *Parent = nullptr;
1062210632 // Checks if the instruction has supported opcode.
10623- auto IsSupportedOpcode = [&](Instruction *I) {
10624- return I && I->getOpcode() == Instruction::Add &&
10633+ auto IsSupportedInstruction = [&](Instruction *I) {
10634+ return I && isSupportedOpcode( I->getOpcode()) &&
1062510635 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
1062610636 };
1062710637 // Exclude operands instructions immediately to improve compile time, it
1062810638 // will be unable to schedule anyway.
1062910639 SmallDenseSet<Value *, 8> Operands;
10640+ SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
1063010641 for (Value *V : VL) {
1063110642 auto *I = dyn_cast<Instruction>(V);
1063210643 if (!I)
1063310644 continue;
1063410645 if (!DT.isReachableFromEntry(I->getParent()))
1063510646 continue;
10636- if (!MainOp ) {
10637- MainOp = I ;
10647+ if (Candidates.empty() ) {
10648+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I) ;
1063810649 Parent = I->getParent();
1063910650 Operands.insert(I->op_begin(), I->op_end());
1064010651 continue;
1064110652 }
1064210653 if (Parent == I->getParent()) {
10643- if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
10644- MainOp = I;
10654+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1064510655 Operands.insert(I->op_begin(), I->op_end());
1064610656 continue;
1064710657 }
@@ -10653,15 +10663,25 @@ class InstructionsCompatibilityAnalysis {
1065310663 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
1065410664 "Different nodes should have different DFS numbers");
1065510665 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10656- MainOp = I;
10666+ Candidates.clear();
10667+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1065710668 Parent = I->getParent();
1065810669 Operands.clear();
1065910670 Operands.insert(I->op_begin(), I->op_end());
1066010671 }
1066110672 }
10662- if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
10663- MainOp = nullptr;
10664- return;
10673+ unsigned BestOpcodeNum = 0;
10674+ MainOp = nullptr;
10675+ for (const auto &P : Candidates) {
10676+ if (P.second.size() < BestOpcodeNum)
10677+ continue;
10678+ for (Instruction *I : P.second) {
10679+ if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10680+ MainOp = I;
10681+ BestOpcodeNum = P.second.size();
10682+ break;
10683+ }
10684+ }
1066510685 }
1066610686 if (MainOp) {
1066710687 // Do not match, if any copyable is a terminator from the same block as
@@ -10682,7 +10702,7 @@ class InstructionsCompatibilityAnalysis {
1068210702 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
1068310703 /// the operand itself, since V or V == V.
1068410704 Value *selectBestIdempotentValue() const {
10685- assert(MainOpcode == Instruction::Add && "Unsupported opcode");
10705+ assert(isSupportedOpcode( MainOpcode) && "Unsupported opcode"); // NOTE(review): assumes getBinOpIdentity() yields a non-null identity for every opcode isSupportedOpcode() accepts - confirm for LShr.
1068610706 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
1068710707 !MainOp->isCommutative());
1068810708 }
@@ -10695,13 +10715,8 @@ class InstructionsCompatibilityAnalysis {
1069510715 return {V, V};
1069610716 if (!S.isCopyableElement(V))
1069710717 return convertTo(cast<Instruction>(V), S).second;
10698- switch (MainOpcode) {
10699- case Instruction::Add:
10700- return {V, selectBestIdempotentValue()};
10701- default:
10702- break;
10703- }
10704- llvm_unreachable("Unsupported opcode");
10718+ assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10719+ return {V, selectBestIdempotentValue()};
1070510720 }
1070610721
1070710722 /// Builds operands for the original instructions.
@@ -10914,6 +10929,21 @@ class InstructionsCompatibilityAnalysis {
1091410929 }
1091510930 if (!Res)
1091610931 return InstructionsState::invalid();
10932+ constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10933+ InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10934+ InstructionCost VectorCost;
10935+ FixedVectorType *VecTy =
10936+ getWidenedType(S.getMainOp()->getType(), VL.size());
10937+ switch (MainOpcode) {
10938+ case Instruction::Add:
10939+ case Instruction::LShr:
10940+ VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10941+ break;
10942+ default:
10943+ llvm_unreachable("Unexpected instruction.");
10944+ }
10945+ if (VectorCost > ScalarCost)
10946+ return InstructionsState::invalid();
1091710947 return S;
1091810948 }
1091910949 assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21200,6 +21230,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2120021230 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
2120121231 const auto *It = find(Op, CD->getInst());
2120221232 assert(It != Op.end() && "Lane not set");
21233+ SmallPtrSet<Instruction *, 4> Visited;
2120321234 do {
2120421235 int Lane = std::distance(Op.begin(), It);
2120521236 assert(Lane >= 0 && "Lane not set");
@@ -21221,13 +21252,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2122121252 (InsertInReadyList && UseSD->isReady()))
2122221253 WorkList.push_back(UseSD);
2122321254 }
21224- } else if (ScheduleData *UseSD = getScheduleData(In)) {
21225- CD->incDependencies();
21226- if (!UseSD->isScheduled())
21227- CD->incrementUnscheduledDeps(1);
21228- if (!UseSD->hasValidDependencies() ||
21229- (InsertInReadyList && UseSD->isReady()))
21230- WorkList.push_back(UseSD);
21255+ } else if (Visited.insert(In).second) {
21256+ if (ScheduleData *UseSD = getScheduleData(In)) {
21257+ CD->incDependencies();
21258+ if (!UseSD->isScheduled())
21259+ CD->incrementUnscheduledDeps(1);
21260+ if (!UseSD->hasValidDependencies() ||
21261+ (InsertInReadyList && UseSD->isReady()))
21262+ WorkList.push_back(UseSD);
21263+ }
2123121264 }
2123221265 It = find(make_range(std::next(It), Op.end()), CD->getInst());
2123321266 } while (It != Op.end());
@@ -21989,9 +22022,11 @@ bool BoUpSLP::collectValuesToDemote(
2198922022 return all_of(E.Scalars, [&](Value *V) {
2199022023 if (isa<PoisonValue>(V))
2199122024 return true;
22025+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22026+ if (E.isCopyableElement(V))
22027+ return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
2199222028 auto *I = cast<Instruction>(V);
2199322029 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21994- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
2199522030 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
2199622031 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
2199722032 SimplifyQuery(*DL));
0 commit comments