@@ -12165,112 +12165,224 @@ InstructionCost BoUpSLP::getSpillCost() const {
1216512165 // live. When we see a call instruction that is not part of our tree,
1216612166 // query TTI to see if there is a cost to keeping values live over it
1216712167 // (for example, if spills and fills are required).
12168- unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12169- InstructionCost Cost = 0;
1217012168
12171- SmallPtrSet<Instruction *, 4> LiveValues;
12172- Instruction *PrevInst = nullptr;
12169+ const TreeEntry *Root = VectorizableTree.front().get();
12170+ if (Root->isGather())
12171+ return 0;
1217312172
12174- // The entries in VectorizableTree are not necessarily ordered by their
12175- // position in basic blocks. Collect them and order them by dominance so later
12176- // instructions are guaranteed to be visited first. For instructions in
12177- // different basic blocks, we only scan to the beginning of the block, so
12178- // their order does not matter, as long as all instructions in a basic block
12179- // are grouped together. Using dominance ensures a deterministic order.
12180- SmallVector<Instruction *, 16> OrderedScalars;
12173+ InstructionCost Cost = 0;
12174+ SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
12175+ EntriesToOperands;
12176+ SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
12177+ SmallPtrSet<const Instruction *, 8> LastInstructions;
1218112178 for (const auto &TEPtr : VectorizableTree) {
12182- if (TEPtr->State != TreeEntry::Vectorize)
12183- continue;
12184- Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12185- if (!Inst)
12186- continue;
12187- OrderedScalars.push_back(Inst);
12188- }
12189- llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12190- auto *NodeA = DT->getNode(A->getParent());
12191- auto *NodeB = DT->getNode(B->getParent());
12192- assert(NodeA && "Should only process reachable instructions");
12193- assert(NodeB && "Should only process reachable instructions");
12194- assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12195- "Different nodes should have different DFS numbers");
12196- if (NodeA != NodeB)
12197- return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12198- return B->comesBefore(A);
12199- });
12200-
12201- for (Instruction *Inst : OrderedScalars) {
12202- if (!PrevInst) {
12203- PrevInst = Inst;
12204- continue;
12179+ if (!TEPtr->isGather()) {
12180+ Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
12181+ EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
12182+ LastInstructions.insert(LastInst);
1220512183 }
12184+ if (TEPtr->UserTreeIndex)
12185+ EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
12186+ }
1220612187
12207- // Update LiveValues.
12208- LiveValues.erase(PrevInst);
12209- for (auto &J : PrevInst->operands()) {
12210- if (isa<Instruction>(&*J) && isVectorized(&*J))
12211- LiveValues.insert(cast<Instruction>(&*J));
12188+ auto NoCallIntrinsic = [this](const Instruction *I) {
12189+ const auto *II = dyn_cast<IntrinsicInst>(I);
12190+ if (!II)
12191+ return false;
12192+ if (II->isAssumeLikeIntrinsic())
12193+ return true;
12194+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12195+ InstructionCost IntrCost =
12196+ TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12197+ InstructionCost CallCost = TTI->getCallInstrCost(
12198+ nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
12199+ return IntrCost < CallCost;
12200+ };
12201+
12202+  // Maps the last instruction of an entry to the last instruction of one of
12203+  // its operand entries, plus a flag. If the flag is true, there are no calls
12204+  // in between these instructions.
12205+ SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
12206+ CheckedInstructions;
12207+ unsigned Budget = 0;
12208+ const unsigned BudgetLimit =
12209+ ScheduleRegionSizeBudget / VectorizableTree.size();
12210+ auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
12211+ const Instruction *Last) {
12212+ assert(First->getParent() == Last->getParent() &&
12213+ "Expected instructions in same block.");
12214+ if (auto It = CheckedInstructions.find(Last);
12215+ It != CheckedInstructions.end()) {
12216+ const Instruction *Checked = It->second.getPointer();
12217+ if (Checked == First || Checked->comesBefore(First))
12218+ return It->second.getInt() != 0;
12219+ Last = Checked;
12220+ } else if (Last == First || Last->comesBefore(First)) {
12221+ return true;
1221212222 }
12223+ BasicBlock::const_reverse_iterator InstIt =
12224+ ++First->getIterator().getReverse(),
12225+ PrevInstIt =
12226+ Last->getIterator().getReverse();
12227+ SmallVector<const Instruction *> LastInstsInRange;
12228+ while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
12229+ // Debug information does not impact spill cost.
12230+ // Vectorized calls, represented as vector intrinsics, do not impact spill
12231+ // cost.
12232+ if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12233+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12234+ for (const Instruction *LastInst : LastInstsInRange)
12235+ CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
12236+ return false;
12237+ }
12238+ if (LastInstructions.contains(&*PrevInstIt))
12239+ LastInstsInRange.push_back(&*PrevInstIt);
1221312240
12214- LLVM_DEBUG({
12215- dbgs() << "SLP: #LV: " << LiveValues.size();
12216- for (auto *X : LiveValues)
12217- dbgs() << " " << X->getName();
12218- dbgs() << ", Looking at ";
12219- Inst->dump();
12241+ ++PrevInstIt;
12242+ ++Budget;
12243+ }
12244+ for (const Instruction *LastInst : LastInstsInRange)
12245+ CheckedInstructions.try_emplace(
12246+ LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
12247+ Budget <= BudgetLimit ? 1 : 0);
12248+ return Budget <= BudgetLimit;
12249+ };
12250+ auto AddCosts = [&](const TreeEntry *Op) {
12251+ Type *ScalarTy = Op->Scalars.front()->getType();
12252+ auto It = MinBWs.find(Op);
12253+ if (It != MinBWs.end())
12254+ ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12255+ auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
12256+ Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
12257+ if (ScalarTy->isVectorTy()) {
12258+ // Handle revec dead vector instructions.
12259+ Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
12260+ }
12261+ };
12262+  // Memoize the relationship between blocks, i.e. whether there is (at least
12263+  // one) non-vectorized call between the blocks. This allows skipping the
12264+  // analysis of the same block paths multiple times.
12265+ SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
12266+ ParentOpParentToPreds;
12267+ auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
12268+ BasicBlock *OpParent) {
12269+ auto Key = std::make_pair(Root, OpParent);
12270+ if (auto It = ParentOpParentToPreds.find(Key);
12271+ It != ParentOpParentToPreds.end())
12272+ return It->second;
12273+ SmallVector<BasicBlock *> Worklist;
12274+ if (Pred)
12275+ Worklist.push_back(Pred);
12276+ else
12277+ Worklist.append(pred_begin(Root), pred_end(Root));
12278+ SmallPtrSet<const BasicBlock *, 16> Visited;
12279+ SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
12280+ ParentsPairsToAdd;
12281+ bool Res = false;
12282+ auto Cleanup = make_scope_exit([&]() {
12283+ for (const auto &KeyPair : ParentsPairsToAdd) {
12284+ assert(!ParentOpParentToPreds.contains(KeyPair) &&
12285+ "Should not have been added before.");
12286+ ParentOpParentToPreds.try_emplace(KeyPair, Res);
12287+ }
1222012288 });
12221-
12222- // Now find the sequence of instructions between PrevInst and Inst.
12223- unsigned NumCalls = 0;
12224- BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12225- PrevInstIt =
12226- PrevInst->getIterator().getReverse();
12227- while (InstIt != PrevInstIt) {
12228- if (PrevInstIt == PrevInst->getParent()->rend()) {
12229- PrevInstIt = Inst->getParent()->rbegin();
12289+ while (!Worklist.empty()) {
12290+ BasicBlock *BB = Worklist.pop_back_val();
12291+ if (BB == OpParent || !Visited.insert(BB).second)
1223012292 continue;
12293+ auto Pair = std::make_pair(BB, OpParent);
12294+ if (auto It = ParentOpParentToPreds.find(Pair);
12295+ It != ParentOpParentToPreds.end()) {
12296+ Res = It->second;
12297+ return Res;
1223112298 }
12232-
12233- auto NoCallIntrinsic = [this](Instruction *I) {
12234- auto *II = dyn_cast<IntrinsicInst>(I);
12235- if (!II)
12236- return false;
12237- if (II->isAssumeLikeIntrinsic())
12238- return true;
12239- FastMathFlags FMF;
12240- SmallVector<Type *, 4> Tys;
12241- for (auto &ArgOp : II->args())
12242- Tys.push_back(ArgOp->getType());
12243- if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12244- FMF = FPMO->getFastMathFlags();
12245- IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12246- FMF);
12247- InstructionCost IntrCost =
12248- TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12249- InstructionCost CallCost = TTI->getCallInstrCost(
12250- nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12251- return IntrCost < CallCost;
12252- };
12253-
12254- // Debug information does not impact spill cost.
12255- if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12256- &*PrevInstIt != PrevInst)
12257- NumCalls++;
12258-
12259- ++PrevInstIt;
12299+ ParentsPairsToAdd.insert(Pair);
12300+ unsigned BlockSize = BB->size();
12301+ if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
12302+ return Res;
12303+ Budget += BlockSize;
12304+ if (Budget > BudgetLimit)
12305+ return Res;
12306+ if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
12307+ BB->getTerminator()))
12308+ return Res;
12309+ Worklist.append(pred_begin(BB), pred_end(BB));
1226012310 }
12261-
12262- if (NumCalls) {
12263- SmallVector<Type *, 4> V;
12264- for (auto *II : LiveValues) {
12265- auto *ScalarTy = II->getType();
12266- if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12267- ScalarTy = VectorTy->getElementType();
12268- V.push_back(getWidenedType(ScalarTy, BundleWidth));
12311+ Res = true;
12312+ return Res;
12313+ };
12314+ SmallVector<const TreeEntry *> LiveEntries(1, Root);
12315+ while (!LiveEntries.empty()) {
12316+ const TreeEntry *Entry = LiveEntries.pop_back_val();
12317+ SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
12318+ if (Operands.empty())
12319+ continue;
12320+ Instruction *LastInst = EntriesToLastInstruction.at(Entry);
12321+ BasicBlock *Parent = LastInst->getParent();
12322+ for (const TreeEntry *Op : Operands) {
12323+ if (!Op->isGather())
12324+ LiveEntries.push_back(Op);
12325+ if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
12326+ (Op->isGather() && allConstant(Op->Scalars)))
12327+ continue;
12328+ Budget = 0;
12329+ BasicBlock *Pred = nullptr;
12330+ if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
12331+ Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12332+ BasicBlock *OpParent;
12333+ Instruction *OpLastInst;
12334+ if (Op->isGather()) {
12335+ assert(Entry->getOpcode() == Instruction::PHI &&
12336+ "Expected phi node only.");
12337+ OpParent = cast<PHINode>(Entry->getMainOp())
12338+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12339+ OpLastInst = OpParent->getTerminator();
12340+ for (Value *V : Op->Scalars) {
12341+ auto *Inst = dyn_cast<Instruction>(V);
12342+ if (!Inst)
12343+ continue;
12344+ if (isVectorized(V)) {
12345+ OpParent = Inst->getParent();
12346+ OpLastInst = Inst;
12347+ break;
12348+ }
12349+ }
12350+ } else {
12351+ OpLastInst = EntriesToLastInstruction.at(Op);
12352+ OpParent = OpLastInst->getParent();
12353+ }
12354+ // Check the call instructions within the same basic blocks.
12355+ if (OpParent == Parent) {
12356+ if (Entry->getOpcode() == Instruction::PHI) {
12357+ if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
12358+ AddCosts(Op);
12359+ continue;
12360+ }
12361+ if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
12362+ AddCosts(Op);
12363+ continue;
12364+ }
12365+ // Check for call instruction in between blocks.
12366+ // 1. Check entry's block to the head.
12367+ if (Entry->getOpcode() != Instruction::PHI &&
12368+ !CheckForNonVecCallsInSameBlock(
12369+ &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
12370+ LastInst)) {
12371+ AddCosts(Op);
12372+ continue;
12373+ }
12374+ // 2. Check op's block from the end.
12375+ if (!CheckForNonVecCallsInSameBlock(OpLastInst,
12376+ OpParent->getTerminator())) {
12377+ AddCosts(Op);
12378+ continue;
12379+ }
12380+ // 3. Check the predecessors of entry's block till op's block.
12381+ if (!CheckPredecessors(Parent, Pred, OpParent)) {
12382+ AddCosts(Op);
12383+ continue;
1226912384 }
12270- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
1227112385 }
12272-
12273- PrevInst = Inst;
1227412386 }
1227512387
1227612388 return Cost;
@@ -12778,8 +12890,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1277812890 }
1277912891 }
1278012892
12781- InstructionCost SpillCost = getSpillCost();
12782- Cost += SpillCost + ExtractCost;
12893+ Cost += ExtractCost;
1278312894 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
1278412895 bool) {
1278512896 InstructionCost C = 0;
@@ -12918,12 +13029,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1291813029 }
1291913030 }
1292013031
13032+ std::optional<InstructionCost> SpillCost;
13033+ if (Cost < -SLPCostThreshold) {
13034+ SpillCost = getSpillCost();
13035+ Cost += *SpillCost;
13036+ }
1292113037#ifndef NDEBUG
1292213038 SmallString<256> Str;
1292313039 {
1292413040 raw_svector_ostream OS(Str);
12925- OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12926- << "SLP: Extract Cost = " << ExtractCost << ".\n"
13041+ OS << "SLP: Spill Cost = ";
13042+ if (SpillCost)
13043+ OS << *SpillCost;
13044+ else
13045+ OS << "<skipped>";
13046+ OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
1292713047 << "SLP: Total Cost = " << Cost << ".\n";
1292813048 }
1292913049 LLVM_DEBUG(dbgs() << Str);
0 commit comments