@@ -12449,111 +12449,224 @@ InstructionCost BoUpSLP::getSpillCost() {
1244912449 // live. When we see a call instruction that is not part of our tree,
1245012450 // query TTI to see if there is a cost to keeping values live over it
1245112451 // (for example, if spills and fills are required).
12452- InstructionCost Cost = 0;
1245312452
12454- SmallPtrSet<const TreeEntry *, 4> LiveEntries;
12455- const TreeEntry *Prev = nullptr;
12453+ const TreeEntry *Root = VectorizableTree.front().get();
12454+ if (Root->isGather())
12455+ return 0;
1245612456
12457- // The entries in VectorizableTree are not necessarily ordered by their
12458- // position in basic blocks. Collect them and order them by dominance so later
12459- // instructions are guaranteed to be visited first. For instructions in
12460- // different basic blocks, we only scan to the beginning of the block, so
12461- // their order does not matter, as long as all instructions in a basic block
12462- // are grouped together. Using dominance ensures a deterministic order.
12463- SmallVector<TreeEntry *, 16> OrderedEntries;
12457+ InstructionCost Cost = 0;
12458+ SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
12459+ EntriesToOperands;
12460+ SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
12461+ SmallPtrSet<const Instruction *, 8> LastInstructions;
1246412462 for (const auto &TEPtr : VectorizableTree) {
12465- if (TEPtr->isGather())
12466- continue;
12467- OrderedEntries.push_back(TEPtr.get());
12468- }
12469- llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
12470- const TreeEntry *TB) {
12471- Instruction &A = getLastInstructionInBundle(TA);
12472- Instruction &B = getLastInstructionInBundle(TB);
12473- auto *NodeA = DT->getNode(A.getParent());
12474- auto *NodeB = DT->getNode(B.getParent());
12475- assert(NodeA && "Should only process reachable instructions");
12476- assert(NodeB && "Should only process reachable instructions");
12477- assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12478- "Different nodes should have different DFS numbers");
12479- if (NodeA != NodeB)
12480- return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12481- return B.comesBefore(&A);
12482- });
12483-
12484- for (const TreeEntry *TE : OrderedEntries) {
12485- if (!Prev) {
12486- Prev = TE;
12487- continue;
12488- }
12489-
12490- LiveEntries.erase(Prev);
12491- for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
12492- const TreeEntry *Op = getVectorizedOperand(Prev, I);
12493- if (!Op)
12494- continue;
12495- assert(!Op->isGather() && "Expected vectorized operand.");
12496- LiveEntries.insert(Op);
12463+ if (!TEPtr->isGather()) {
12464+ Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
12465+ EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
12466+ LastInstructions.insert(LastInst);
1249712467 }
12468+ if (TEPtr->UserTreeIndex)
12469+ EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
12470+ }
1249812471
12499- LLVM_DEBUG({
12500- dbgs() << "SLP: #LV: " << LiveEntries.size();
12501- for (auto *X : LiveEntries)
12502- X->dump();
12503- dbgs() << ", Looking at ";
12504- TE->dump();
12505- });
12506-
12507- // Now find the sequence of instructions between PrevInst and Inst.
12508- unsigned NumCalls = 0;
12509- const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
12510- BasicBlock::const_reverse_iterator
12511- InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
12512- PrevInstIt = PrevInst->getIterator().getReverse();
12513- while (InstIt != PrevInstIt) {
12514- if (PrevInstIt == PrevInst->getParent()->rend()) {
12515- PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
12516- continue;
12517- }
12518-
12519- auto NoCallIntrinsic = [this](const Instruction *I) {
12520- const auto *II = dyn_cast<IntrinsicInst>(I);
12521- if (!II)
12522- return false;
12523- if (II->isAssumeLikeIntrinsic())
12524- return true;
12525- IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12526- InstructionCost IntrCost =
12527- TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12528- InstructionCost CallCost =
12529- TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
12530- TTI::TCK_RecipThroughput);
12531- return IntrCost < CallCost;
12532- };
12472+ auto NoCallIntrinsic = [this](const Instruction *I) {
12473+ const auto *II = dyn_cast<IntrinsicInst>(I);
12474+ if (!II)
12475+ return false;
12476+ if (II->isAssumeLikeIntrinsic())
12477+ return true;
12478+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12479+ InstructionCost IntrCost =
12480+ TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12481+ InstructionCost CallCost = TTI->getCallInstrCost(
12482+ nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
12483+ return IntrCost < CallCost;
12484+ };
1253312485
12486+ // Maps the last instruction of an entry to the last instruction of one of
12487+ // its operand entries, plus a flag. If the flag is true, there are no calls
12488+ // in between these two instructions.
12489+ SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
12490+ CheckedInstructions;
12491+ unsigned Budget = 0;
12492+ const unsigned BudgetLimit =
12493+ ScheduleRegionSizeBudget / VectorizableTree.size();
12494+ auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
12495+ const Instruction *Last) {
12496+ assert(First->getParent() == Last->getParent() &&
12497+ "Expected instructions in same block.");
12498+ if (auto It = CheckedInstructions.find(Last);
12499+ It != CheckedInstructions.end()) {
12500+ const Instruction *Checked = It->second.getPointer();
12501+ if (Checked == First || Checked->comesBefore(First))
12502+ return It->second.getInt() != 0;
12503+ Last = Checked;
12504+ } else if (Last == First || Last->comesBefore(First)) {
12505+ return true;
12506+ }
12507+ BasicBlock::const_reverse_iterator InstIt =
12508+ ++First->getIterator().getReverse(),
12509+ PrevInstIt =
12510+ Last->getIterator().getReverse();
12511+ SmallVector<const Instruction *> LastInstsInRange;
12512+ while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
1253412513 // Debug information does not impact spill cost.
1253512514 // Vectorized calls, represented as vector intrinsics, do not impact spill
1253612515 // cost.
1253712516 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12538- CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
12539- NumCalls++;
12517+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12518+ for (const Instruction *LastInst : LastInstsInRange)
12519+ CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
12520+ return false;
12521+ }
12522+ if (LastInstructions.contains(&*PrevInstIt))
12523+ LastInstsInRange.push_back(&*PrevInstIt);
1254012524
1254112525 ++PrevInstIt;
12526+ ++Budget;
1254212527 }
12543-
12544- if (NumCalls) {
12545- SmallVector<Type *, 4> EntriesTypes;
12546- for (const TreeEntry *TE : LiveEntries) {
12547- auto *ScalarTy = TE->getMainOp()->getType();
12548- auto It = MinBWs.find(TE);
12549- if (It != MinBWs.end())
12550- ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12551- EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
12528+ for (const Instruction *LastInst : LastInstsInRange)
12529+ CheckedInstructions.try_emplace(
12530+ LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
12531+ Budget <= BudgetLimit ? 1 : 0);
12532+ return Budget <= BudgetLimit;
12533+ };
12534+ auto AddCosts = [&](const TreeEntry *Op) {
12535+ Type *ScalarTy = Op->Scalars.front()->getType();
12536+ auto It = MinBWs.find(Op);
12537+ if (It != MinBWs.end())
12538+ ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12539+ auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
12540+ Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
12541+ if (ScalarTy->isVectorTy()) {
12542+ // Handle revec dead vector instructions.
12543+ Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
12544+ }
12545+ };
12546+ // Memoize the relationship between blocks, i.e. whether there is at least
12547+ // one non-vectorized call between the blocks. This avoids analyzing the same
12548+ // block paths multiple times.
12549+ SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
12550+ ParentOpParentToPreds;
12551+ auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
12552+ BasicBlock *OpParent) {
12553+ auto Key = std::make_pair(Root, OpParent);
12554+ if (auto It = ParentOpParentToPreds.find(Key);
12555+ It != ParentOpParentToPreds.end())
12556+ return It->second;
12557+ SmallVector<BasicBlock *> Worklist;
12558+ if (Pred)
12559+ Worklist.push_back(Pred);
12560+ else
12561+ Worklist.append(pred_begin(Root), pred_end(Root));
12562+ SmallPtrSet<const BasicBlock *, 16> Visited;
12563+ SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
12564+ ParentsPairsToAdd;
12565+ bool Res = false;
12566+ auto Cleanup = make_scope_exit([&]() {
12567+ for (const auto &KeyPair : ParentsPairsToAdd) {
12568+ assert(!ParentOpParentToPreds.contains(KeyPair) &&
12569+ "Should not have been added before.");
12570+ ParentOpParentToPreds.try_emplace(KeyPair, Res);
12571+ }
12572+ });
12573+ while (!Worklist.empty()) {
12574+ BasicBlock *BB = Worklist.pop_back_val();
12575+ if (BB == OpParent || !Visited.insert(BB).second)
12576+ continue;
12577+ auto Pair = std::make_pair(BB, OpParent);
12578+ if (auto It = ParentOpParentToPreds.find(Pair);
12579+ It != ParentOpParentToPreds.end()) {
12580+ Res = It->second;
12581+ return Res;
12582+ }
12583+ ParentsPairsToAdd.insert(Pair);
12584+ unsigned BlockSize = BB->size();
12585+ if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
12586+ return Res;
12587+ Budget += BlockSize;
12588+ if (Budget > BudgetLimit)
12589+ return Res;
12590+ if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
12591+ BB->getTerminator()))
12592+ return Res;
12593+ Worklist.append(pred_begin(BB), pred_end(BB));
12594+ }
12595+ Res = true;
12596+ return Res;
12597+ };
12598+ SmallVector<const TreeEntry *> LiveEntries(1, Root);
12599+ while (!LiveEntries.empty()) {
12600+ const TreeEntry *Entry = LiveEntries.pop_back_val();
12601+ SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
12602+ if (Operands.empty())
12603+ continue;
12604+ Instruction *LastInst = EntriesToLastInstruction.at(Entry);
12605+ BasicBlock *Parent = LastInst->getParent();
12606+ for (const TreeEntry *Op : Operands) {
12607+ if (!Op->isGather())
12608+ LiveEntries.push_back(Op);
12609+ if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
12610+ (Op->isGather() && allConstant(Op->Scalars)))
12611+ continue;
12612+ Budget = 0;
12613+ BasicBlock *Pred = nullptr;
12614+ if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
12615+ Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12616+ BasicBlock *OpParent;
12617+ Instruction *OpLastInst;
12618+ if (Op->isGather()) {
12619+ assert(Entry->getOpcode() == Instruction::PHI &&
12620+ "Expected phi node only.");
12621+ OpParent = cast<PHINode>(Entry->getMainOp())
12622+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12623+ OpLastInst = OpParent->getTerminator();
12624+ for (Value *V : Op->Scalars) {
12625+ auto *Inst = dyn_cast<Instruction>(V);
12626+ if (!Inst)
12627+ continue;
12628+ if (isVectorized(V)) {
12629+ OpParent = Inst->getParent();
12630+ OpLastInst = Inst;
12631+ break;
12632+ }
12633+ }
12634+ } else {
12635+ OpLastInst = EntriesToLastInstruction.at(Op);
12636+ OpParent = OpLastInst->getParent();
12637+ }
12638+ // Check the call instructions within the same basic blocks.
12639+ if (OpParent == Parent) {
12640+ if (Entry->getOpcode() == Instruction::PHI) {
12641+ if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
12642+ AddCosts(Op);
12643+ continue;
12644+ }
12645+ if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
12646+ AddCosts(Op);
12647+ continue;
12648+ }
12649+ // Check for call instruction in between blocks.
12650+ // 1. Check entry's block to the head.
12651+ if (Entry->getOpcode() != Instruction::PHI &&
12652+ !CheckForNonVecCallsInSameBlock(
12653+ &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
12654+ LastInst)) {
12655+ AddCosts(Op);
12656+ continue;
12657+ }
12658+ // 2. Check op's block from the end.
12659+ if (!CheckForNonVecCallsInSameBlock(OpLastInst,
12660+ OpParent->getTerminator())) {
12661+ AddCosts(Op);
12662+ continue;
12663+ }
12664+ // 3. Check the predecessors of entry's block till op's block.
12665+ if (!CheckPredecessors(Parent, Pred, OpParent)) {
12666+ AddCosts(Op);
12667+ continue;
1255212668 }
12553- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
1255412669 }
12555-
12556- Prev = TE;
1255712670 }
1255812671
1255912672 return Cost;
@@ -13061,8 +13174,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1306113174 }
1306213175 }
1306313176
13064- InstructionCost SpillCost = getSpillCost();
13065- Cost += SpillCost + ExtractCost;
13177+ Cost += ExtractCost;
1306613178 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
1306713179 bool) {
1306813180 InstructionCost C = 0;
@@ -13201,12 +13313,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1320113313 }
1320213314 }
1320313315
13316+ std::optional<InstructionCost> SpillCost;
13317+ if (Cost < -SLPCostThreshold) {
13318+ SpillCost = getSpillCost();
13319+ Cost += *SpillCost;
13320+ }
1320413321#ifndef NDEBUG
1320513322 SmallString<256> Str;
1320613323 {
1320713324 raw_svector_ostream OS(Str);
13208- OS << "SLP: Spill Cost = " << SpillCost << ".\n"
13209- << "SLP: Extract Cost = " << ExtractCost << ".\n"
13325+ OS << "SLP: Spill Cost = ";
13326+ if (SpillCost)
13327+ OS << *SpillCost;
13328+ else
13329+ OS << "<skipped>";
13330+ OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
1321013331 << "SLP: Total Cost = " << Cost << ".\n";
1321113332 }
1321213333 LLVM_DEBUG(dbgs() << Str);
0 commit comments