@@ -12448,109 +12448,201 @@ InstructionCost BoUpSLP::getSpillCost() {
1244812448 // (for example, if spills and fills are required).
1244912449 InstructionCost Cost = 0;
1245012450
12451- SmallPtrSet<const TreeEntry *, 4> LiveEntries;
12452- const TreeEntry *Prev = nullptr;
12453-
12454- // The entries in VectorizableTree are not necessarily ordered by their
12455- // position in basic blocks. Collect them and order them by dominance so later
12456- // instructions are guaranteed to be visited first. For instructions in
12457- // different basic blocks, we only scan to the beginning of the block, so
12458- // their order does not matter, as long as all instructions in a basic block
12459- // are grouped together. Using dominance ensures a deterministic order.
12460- SmallVector<TreeEntry *, 16> OrderedEntries;
12461- for (const auto &TEPtr : VectorizableTree) {
12462- if (TEPtr->isGather())
12463- continue;
12464- OrderedEntries.push_back(TEPtr.get());
12465- }
12466- llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
12467- const TreeEntry *TB) {
12468- Instruction &A = getLastInstructionInBundle(TA);
12469- Instruction &B = getLastInstructionInBundle(TB);
12470- auto *NodeA = DT->getNode(A.getParent());
12471- auto *NodeB = DT->getNode(B.getParent());
12472- assert(NodeA && "Should only process reachable instructions");
12473- assert(NodeB && "Should only process reachable instructions");
12474- assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12475- "Different nodes should have different DFS numbers");
12476- if (NodeA != NodeB)
12477- return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12478- return B.comesBefore(&A);
12479- });
12480-
12481- for (const TreeEntry *TE : OrderedEntries) {
12482- if (!Prev) {
12483- Prev = TE;
12484- continue;
12485- }
12451+ const TreeEntry *Root = VectorizableTree.front().get();
12452+ if (Root->isGather())
12453+ return Cost;
1248612454
12487- LiveEntries.erase(Prev);
12488- for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
12489- const TreeEntry *Op = getVectorizedOperand(Prev, I);
12490- if (!Op)
12491- continue;
12492- assert(!Op->isGather() && "Expected vectorized operand.");
12493- LiveEntries.insert(Op);
12455+ SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
12456+ EntriesToOperands;
12457+ SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
12458+ SmallPtrSet<const Instruction *, 8> LastInstructions;
12459+ for (const auto &TEPtr : VectorizableTree) {
12460+ if (!TEPtr->isGather()) {
12461+ Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
12462+ EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
12463+ LastInstructions.insert(LastInst);
1249412464 }
12465+ if (TEPtr->UserTreeIndex)
12466+ EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
12467+ }
1249512468
12496- LLVM_DEBUG({
12497- dbgs() << "SLP: #LV: " << LiveEntries.size();
12498- for (auto *X : LiveEntries)
12499- X->dump();
12500- dbgs() << ", Looking at ";
12501- TE->dump();
12502- });
12503-
12504- // Now find the sequence of instructions between PrevInst and Inst.
12505- unsigned NumCalls = 0;
12506- const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
12507- BasicBlock::const_reverse_iterator
12508- InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
12509- PrevInstIt = PrevInst->getIterator().getReverse();
12510- while (InstIt != PrevInstIt) {
12511- if (PrevInstIt == PrevInst->getParent()->rend()) {
12512- PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
12513- continue;
12514- }
12515-
12516- auto NoCallIntrinsic = [this](const Instruction *I) {
12517- const auto *II = dyn_cast<IntrinsicInst>(I);
12518- if (!II)
12519- return false;
12520- if (II->isAssumeLikeIntrinsic())
12521- return true;
12522- IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12523- InstructionCost IntrCost =
12524- TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12525- InstructionCost CallCost =
12526- TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
12527- TTI::TCK_RecipThroughput);
12528- return IntrCost < CallCost;
12529- };
12469+ auto NoCallIntrinsic = [this](const Instruction *I) {
12470+ const auto *II = dyn_cast<IntrinsicInst>(I);
12471+ if (!II)
12472+ return false;
12473+ if (II->isAssumeLikeIntrinsic())
12474+ return true;
12475+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12476+ InstructionCost IntrCost =
12477+ TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12478+ InstructionCost CallCost = TTI->getCallInstrCost(
12479+ nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
12480+ return IntrCost < CallCost;
12481+ };
1253012482
12483+ SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
12484+ CheckedInstructions;
12485+ unsigned Budget = 0;
12486+ const unsigned BudgetLimit =
12487+ ScheduleRegionSizeBudget / VectorizableTree.size();
12488+ auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
12489+ Instruction *Last) {
12490+ assert(First->getParent() == Last->getParent() &&
12491+ "Expected instructions in same block.");
12492+ if (Last == First || Last->comesBefore(First))
12493+ return true;
12494+ BasicBlock::const_reverse_iterator InstIt =
12495+ ++First->getIterator().getReverse(),
12496+ PrevInstIt =
12497+ Last->getIterator().getReverse();
12498+ auto It = CheckedInstructions.find(Last);
12499+ if (It != CheckedInstructions.end()) {
12500+ const Instruction *Checked = It->second.getPointer();
12501+ if (Checked == First || Checked->comesBefore(First))
12502+ return It->second.getInt() != 0;
12503+ PrevInstIt = Checked->getIterator().getReverse();
12504+ }
12505+ SmallVector<const Instruction *> LastInstsInRange(1, Last);
12506+ while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
1253112507 // Debug information does not impact spill cost.
1253212508 // Vectorized calls, represented as vector intrinsics, do not impact spill
1253312509 // cost.
1253412510 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12535- CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
12536- NumCalls++;
12511+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12512+ for (const Instruction *LastInst : LastInstsInRange)
12513+ CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
12514+ return false;
12515+ }
12516+ if (LastInstructions.contains(&*PrevInstIt))
12517+ LastInstsInRange.push_back(&*PrevInstIt);
1253712518
1253812519 ++PrevInstIt;
12520+ ++Budget;
1253912521 }
12540-
12541- if (NumCalls) {
12542- SmallVector<Type *, 4> EntriesTypes;
12543- for (const TreeEntry *TE : LiveEntries) {
12544- auto *ScalarTy = TE->getMainOp()->getType();
12545- auto It = MinBWs.find(TE);
12546- if (It != MinBWs.end())
12547- ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12548- EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
12522+ for (const Instruction *LastInst : LastInstsInRange)
12523+ CheckedInstructions.try_emplace(
12524+ LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
12525+ Budget <= BudgetLimit ? 1 : 0);
12526+ return Budget <= BudgetLimit;
12527+ };
12528+ auto AddCosts = [&](const TreeEntry *Op) {
12529+ Type *ScalarTy = Op->Scalars.front()->getType();
12530+ auto It = MinBWs.find(Op);
12531+ if (It != MinBWs.end())
12532+ ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12533+ auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
12534+ Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
12535+ if (ScalarTy->isVectorTy()) {
12536+ // Handle revec dead vector instructions.
12537+ Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
12538+ }
12539+ };
12540+ SmallDenseMap<const BasicBlock *, bool> BlocksToCalls;
12541+ auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
12542+ BasicBlock *OpParent) {
12543+ SmallVector<BasicBlock *> Worklist;
12544+ if (Pred)
12545+ Worklist.push_back(Pred);
12546+ else
12547+ Worklist.append(pred_begin(Root), pred_end(Root));
12548+ SmallPtrSet<const BasicBlock *, 16> Visited;
12549+ while (!Worklist.empty()) {
12550+ BasicBlock *BB = Worklist.pop_back_val();
12551+ if (BB == OpParent || !Visited.insert(BB).second)
12552+ continue;
12553+ if (auto It = BlocksToCalls.find(BB); It != BlocksToCalls.end()) {
12554+ Worklist.append(pred_begin(BB), pred_end(BB));
12555+ if (!It->second)
12556+ return false;
12557+ continue;
12558+ }
12559+ BlocksToCalls[BB] = false;
12560+ if (BB->sizeWithoutDebug() > ScheduleRegionSizeBudget)
12561+ return false;
12562+ Budget += BB->sizeWithoutDebug();
12563+ if (Budget > BudgetLimit)
12564+ return false;
12565+ if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
12566+ BB->getTerminator()))
12567+ return false;
12568+ BlocksToCalls[BB] = true;
12569+ Worklist.append(pred_begin(BB), pred_end(BB));
12570+ }
12571+ return true;
12572+ };
12573+ SmallVector<const TreeEntry *> LiveEntries(1, Root);
12574+ while (!LiveEntries.empty()) {
12575+ const TreeEntry *Entry = LiveEntries.pop_back_val();
12576+ SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
12577+ if (Operands.empty())
12578+ continue;
12579+ Instruction *LastInst = EntriesToLastInstruction.at(Entry);
12580+ for (const TreeEntry *Op : Operands) {
12581+ if (!Op->isGather())
12582+ LiveEntries.push_back(Op);
12583+ BasicBlock *Parent = Entry->getMainOp()->getParent();
12584+ if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
12585+ (Op->isGather() && allConstant(Op->Scalars)))
12586+ continue;
12587+ Budget = 0;
12588+ BasicBlock *Pred = Entry->getOpcode() == Instruction::PHI
12589+ ? cast<PHINode>(Entry->getMainOp())
12590+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx)
12591+ : nullptr;
12592+ BasicBlock *OpParent;
12593+ Instruction *OpLastInst;
12594+ if (Op->isGather()) {
12595+ assert(Entry->getOpcode() == Instruction::PHI &&
12596+ "Expected phi node only.");
12597+ OpParent = cast<PHINode>(Entry->getMainOp())
12598+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12599+ OpLastInst = OpParent->getTerminator();
12600+ for (Value *V : Op->Scalars) {
12601+ auto *Inst = dyn_cast<Instruction>(V);
12602+ if (!Inst)
12603+ continue;
12604+ if (isVectorized(V)) {
12605+ OpParent = Inst->getParent();
12606+ OpLastInst = Inst;
12607+ break;
12608+ }
12609+ }
12610+ } else {
12611+ OpLastInst = EntriesToLastInstruction.at(Op);
12612+ OpParent = Op->getMainOp()->getParent();
12613+ }
12614+ // Check the call instructions within the same basic blocks.
12615+ if (OpParent == Parent) {
12616+ if (Entry->getOpcode() == Instruction::PHI) {
12617+ if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
12618+ AddCosts(Op);
12619+ continue;
12620+ }
12621+ if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
12622+ AddCosts(Op);
12623+ continue;
12624+ }
12625+ // Check for call instruction in between blocks.
12626+ // 1. Check entry's block to the head.
12627+ if (Entry->getOpcode() != Instruction::PHI &&
12628+ !CheckForNonVecCallsInSameBlock(
12629+ &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
12630+ LastInst)) {
12631+ AddCosts(Op);
12632+ continue;
12633+ }
12634+ // 2. Check op's block from the end.
12635+ if (!CheckForNonVecCallsInSameBlock(OpLastInst,
12636+ OpParent->getTerminator())) {
12637+ AddCosts(Op);
12638+ continue;
12639+ }
12640+ // 3. Check the predecessors of entry's block till op's block.
12641+ if (!CheckPredecessors(Parent, Pred, OpParent)) {
12642+ AddCosts(Op);
12643+ continue;
1254912644 }
12550- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
1255112645 }
12552-
12553- Prev = TE;
1255412646 }
1255512647
1255612648 return Cost;
0 commit comments