@@ -4365,14 +4365,18 @@ class BoUpSLP {
43654365 } else {
43664366 // Build a map for gathered scalars to the nodes where they are used.
43674367 bool AllConstsOrCasts = true;
4368- for (Value *V : VL)
4368+ for (Value *V : VL) {
4369+ if (S && S.areInstructionsWithCopyableElements() &&
4370+ S.isCopyableElement(V))
4371+ Last->addCopyableElement(V);
43694372 if (!isConstant(V)) {
43704373 auto *I = dyn_cast<CastInst>(V);
43714374 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
43724375 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
43734376 !UserTreeIdx.UserTE->isGather())
43744377 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
43754378 }
4379+ }
43764380 if (AllConstsOrCasts)
43774381 CastMaxMinBWSizes =
43784382 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
@@ -10564,35 +10568,41 @@ class InstructionsCompatibilityAnalysis {
1056410568 unsigned MainOpcode = 0;
1056510569 Instruction *MainOp = nullptr;
1056610570
10571+ /// Returns true if \p Opcode can be used as the main opcode for a bundle with
10572+ /// copyable elements. Only Instruction::Add and Instruction::LShr are accepted.
10573+ static bool isSupportedOpcode(const unsigned Opcode) {
10574+ return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10575+ }
10576+
1056710577 /// Identifies the best candidate value, which represents main opcode
1056810578 /// operation.
1056910579 /// Currently the best candidate is an instruction with a supported opcode
1057010580 /// (the opcode with the most candidates is preferred) from the parent block
1057010580 /// with the highest DFS incoming number (block, that dominates other).
1057110581 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
1057210582 BasicBlock *Parent = nullptr;
1057310583 // Checks if the instruction has a supported opcode.
10574- auto IsSupportedOpcode = [&](Instruction *I) {
10575- return I && I->getOpcode() == Instruction::Add &&
10584+ auto IsSupportedInstruction = [&](Instruction *I) {
10585+ return I && isSupportedOpcode( I->getOpcode()) &&
1057610586 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
1057710587 };
1057810588 // Exclude operand instructions immediately to improve compile time;
1057910589 // scheduling would be impossible for them anyway.
1058010590 SmallDenseSet<Value *, 8> Operands;
10591+ SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
1058110592 for (Value *V : VL) {
1058210593 auto *I = dyn_cast<Instruction>(V);
1058310594 if (!I)
1058410595 continue;
1058510596 if (!DT.isReachableFromEntry(I->getParent()))
1058610597 continue;
10587- if (!MainOp ) {
10588- MainOp = I ;
10598+ if (Candidates.empty() ) {
10599+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I) ;
1058910600 Parent = I->getParent();
1059010601 Operands.insert(I->op_begin(), I->op_end());
1059110602 continue;
1059210603 }
1059310604 if (Parent == I->getParent()) {
10594- if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
10595- MainOp = I;
10605+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1059610606 Operands.insert(I->op_begin(), I->op_end());
1059710607 continue;
1059810608 }
@@ -10604,24 +10614,35 @@ class InstructionsCompatibilityAnalysis {
1060410614 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
1060510615 "Different nodes should have different DFS numbers");
1060610616 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10607- MainOp = I;
10617+ Candidates.clear();
10618+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
1060810619 Parent = I->getParent();
1060910620 Operands.clear();
1061010621 Operands.insert(I->op_begin(), I->op_end());
1061110622 }
1061210623 }
10613- if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
10614- MainOp = nullptr;
10615- return;
10624+ unsigned BestOpcodeNum = 0;
10625+ MainOp = nullptr;
10626+ for (const auto &P : Candidates) {
10627+ if (P.second.size() < BestOpcodeNum)
10628+ continue;
10629+ for (Instruction *I : P.second) {
10630+ if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10631+ MainOp = I;
10632+ BestOpcodeNum = P.second.size();
10633+ break;
10634+ }
10635+ }
1061610636 }
10617- MainOpcode = MainOp->getOpcode();
10637+ if (MainOp)
10638+ MainOpcode = MainOp->getOpcode();
1061810639 }
1061910640
1062010641 /// Returns the idempotent (identity) value for the \p MainOp with the detected
1062110642 /// \p MainOpcode, as produced by ConstantExpr::getBinOpIdentity for the type of
1062110642 /// \p MainOp. For Add, returns 0. For Or, it should choose between false and
1062210643 /// the operand itself, since V or V == V.
1062310644 Value *selectBestIdempotentValue() const {
10624- assert(MainOpcode == Instruction::Add && "Unsupported opcode");
10645+ assert(isSupportedOpcode( MainOpcode) && "Unsupported opcode");
1062510646 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
1062610647 !MainOp->isCommutative()); // NOTE(review): presumably permits an RHS-only identity for non-commutative ops — confirm
1062710648 }
@@ -10634,13 +10655,8 @@ class InstructionsCompatibilityAnalysis {
1063410655 return {V, V};
1063510656 if (!S.isCopyableElement(V))
1063610657 return convertTo(cast<Instruction>(V), S).second;
10637- switch (MainOpcode) {
10638- case Instruction::Add:
10639- return {V, selectBestIdempotentValue()};
10640- default:
10641- break;
10642- }
10643- llvm_unreachable("Unsupported opcode");
10658+ assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10659+ return {V, selectBestIdempotentValue()};
1064410660 }
1064510661
1064610662 /// Builds operands for the original instructions.
@@ -10853,6 +10869,21 @@ class InstructionsCompatibilityAnalysis {
1085310869 }
1085410870 if (!Res)
1085510871 return InstructionsState::invalid();
10872+ constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10873+ InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10874+ InstructionCost VectorCost;
10875+ FixedVectorType *VecTy =
10876+ getWidenedType(S.getMainOp()->getType(), VL.size());
10877+ switch (MainOpcode) {
10878+ case Instruction::Add:
10879+ case Instruction::LShr:
10880+ VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10881+ break;
10882+ default:
10883+ llvm_unreachable("Unexpected instruction.");
10884+ }
10885+ if (VectorCost > ScalarCost)
10886+ return InstructionsState::invalid();
1085610887 return S;
1085710888 }
1085810889 assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21090,6 +21121,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2109021121 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
2109121122 const auto *It = find(Op, CD->getInst());
2109221123 assert(It != Op.end() && "Lane not set");
21124+ SmallPtrSet<Instruction *, 4> Visited;
2109321125 do {
2109421126 int Lane = std::distance(Op.begin(), It);
2109521127 assert(Lane >= 0 && "Lane not set");
@@ -21111,13 +21143,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2111121143 (InsertInReadyList && UseSD->isReady()))
2111221144 WorkList.push_back(UseSD);
2111321145 }
21114- } else if (ScheduleData *UseSD = getScheduleData(In)) {
21115- CD->incDependencies();
21116- if (!UseSD->isScheduled())
21117- CD->incrementUnscheduledDeps(1);
21118- if (!UseSD->hasValidDependencies() ||
21119- (InsertInReadyList && UseSD->isReady()))
21120- WorkList.push_back(UseSD);
21146+ } else if (Visited.insert(In).second) {
21147+ if (ScheduleData *UseSD = getScheduleData(In)) {
21148+ CD->incDependencies();
21149+ if (!UseSD->isScheduled())
21150+ CD->incrementUnscheduledDeps(1);
21151+ if (!UseSD->hasValidDependencies() ||
21152+ (InsertInReadyList && UseSD->isReady()))
21153+ WorkList.push_back(UseSD);
21154+ }
2112121155 }
2112221156 It = find(make_range(std::next(It), Op.end()), CD->getInst());
2112321157 } while (It != Op.end());
@@ -21875,9 +21909,11 @@ bool BoUpSLP::collectValuesToDemote(
2187521909 return all_of(E.Scalars, [&](Value *V) {
2187621910 if (isa<PoisonValue>(V))
2187721911 return true;
21912+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21913+ if (E.isCopyableElement(V))
21914+ return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
2187821915 auto *I = cast<Instruction>(V);
2187921916 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21880- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
2188121917 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
2188221918 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
2188321919 SimplifyQuery(*DL));
0 commit comments