alexey-bataev · alexey-bataev · Mar 4, 2025
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12449,111 +12449,224 @@ InstructionCost BoUpSLP::getSpillCost() {
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  InstructionCost Cost = 0;
 
-  SmallPtrSet<const TreeEntry *, 4> LiveEntries;
-  const TreeEntry *Prev = nullptr;
+  const TreeEntry *Root = VectorizableTree.front().get();
+  if (Root->isGather())
+    return 0;
 
-  // The entries in VectorizableTree are not necessarily ordered by their
-  // position in basic blocks. Collect them and order them by dominance so later
-  // instructions are guaranteed to be visited first. For instructions in
-  // different basic blocks, we only scan to the beginning of the block, so
-  // their order does not matter, as long as all instructions in a basic block
-  // are grouped together. Using dominance ensures a deterministic order.
-  SmallVector<TreeEntry *, 16> OrderedEntries;
+  InstructionCost Cost = 0;
+  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
+      EntriesToOperands;
+  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
+  SmallPtrSet<const Instruction *, 8> LastInstructions;
   for (const auto &TEPtr : VectorizableTree) {
-    if (TEPtr->isGather())
-      continue;
-    OrderedEntries.push_back(TEPtr.get());
-  }
-  llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
-                                        const TreeEntry *TB) {
-    Instruction &A = getLastInstructionInBundle(TA);
-    Instruction &B = getLastInstructionInBundle(TB);
-    auto *NodeA = DT->getNode(A.getParent());
-    auto *NodeB = DT->getNode(B.getParent());
-    assert(NodeA && "Should only process reachable instructions");
-    assert(NodeB && "Should only process reachable instructions");
-    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
-           "Different nodes should have different DFS numbers");
-    if (NodeA != NodeB)
-      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
-    return B.comesBefore(&A);
-  });
-
-  for (const TreeEntry *TE : OrderedEntries) {
-    if (!Prev) {
-      Prev = TE;
-      continue;
-    }
-
-    LiveEntries.erase(Prev);
-    for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
-      const TreeEntry *Op = getVectorizedOperand(Prev, I);
-      if (!Op)
-        continue;
-      assert(!Op->isGather() && "Expected vectorized operand.");
-      LiveEntries.insert(Op);
+    if (!TEPtr->isGather()) {
+      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
+      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
+      LastInstructions.insert(LastInst);
     }
+    if (TEPtr->UserTreeIndex)
+      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
+  }
 
-    LLVM_DEBUG({
-      dbgs() << "SLP: #LV: " << LiveEntries.size();
-      for (auto *X : LiveEntries)
-        X->dump();
-      dbgs() << ", Looking at ";
-      TE->dump();
-    });
-
-    // Now find the sequence of instructions between PrevInst and Inst.
-    unsigned NumCalls = 0;
-    const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
-    BasicBlock::const_reverse_iterator
-        InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
-        PrevInstIt = PrevInst->getIterator().getReverse();
-    while (InstIt != PrevInstIt) {
-      if (PrevInstIt == PrevInst->getParent()->rend()) {
-        PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
-        continue;
-      }
-
-      auto NoCallIntrinsic = [this](const Instruction *I) {
-        const auto *II = dyn_cast<IntrinsicInst>(I);
-        if (!II)
-          return false;
-        if (II->isAssumeLikeIntrinsic())
-          return true;
-        IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
-        InstructionCost IntrCost =
-            TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
-        InstructionCost CallCost =
-            TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
-                                  TTI::TCK_RecipThroughput);
-        return IntrCost < CallCost;
-      };
+  auto NoCallIntrinsic = [this](const Instruction *I) {
+    const auto *II = dyn_cast<IntrinsicInst>(I);
+    if (!II)
+      return false;
+    if (II->isAssumeLikeIntrinsic())
+      return true;
+    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
+    InstructionCost IntrCost =
+        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
+    InstructionCost CallCost = TTI->getCallInstrCost(
+        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
+    return IntrCost < CallCost;
+  };
 
+  // Maps last instruction in the entry to the last instruction for the one of
+  // operand entries and the flag. If the flag is true, there are no calls in
+  // between these instructions.
+  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
+      CheckedInstructions;
+  unsigned Budget = 0;
+  const unsigned BudgetLimit =
+      ScheduleRegionSizeBudget / VectorizableTree.size();
+  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
+                                            const Instruction *Last) {
+    assert(First->getParent() == Last->getParent() &&
+           "Expected instructions in same block.");
+    if (auto It = CheckedInstructions.find(Last);
+        It != CheckedInstructions.end()) {
+      const Instruction *Checked = It->second.getPointer();
+      if (Checked == First || Checked->comesBefore(First))
+        return It->second.getInt() != 0;
+      Last = Checked;
+    } else if (Last == First || Last->comesBefore(First)) {
+      return true;
+    }
+    BasicBlock::const_reverse_iterator InstIt =
+                                           ++First->getIterator().getReverse(),
+                                       PrevInstIt =
+                                           Last->getIterator().getReverse();
+    SmallVector<const Instruction *> LastInstsInRange;
+    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
       // Debug information does not impact spill cost.
       // Vectorized calls, represented as vector intrinsics, do not impact spill
       // cost.
       if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
-          CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
-        NumCalls++;
+          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
+        for (const Instruction *LastInst : LastInstsInRange)
+          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
+        return false;
+      }
+      if (LastInstructions.contains(&*PrevInstIt))
+        LastInstsInRange.push_back(&*PrevInstIt);
 
       ++PrevInstIt;
+      ++Budget;
     }
-
-    if (NumCalls) {
-      SmallVector<Type *, 4> EntriesTypes;
-      for (const TreeEntry *TE : LiveEntries) {
-        auto *ScalarTy = TE->getMainOp()->getType();
-        auto It = MinBWs.find(TE);
-        if (It != MinBWs.end())
-          ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
-        EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
+    for (const Instruction *LastInst : LastInstsInRange)
+      CheckedInstructions.try_emplace(
+          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
+          Budget <= BudgetLimit ? 1 : 0);
+    return Budget <= BudgetLimit;
+  };
+  auto AddCosts = [&](const TreeEntry *Op) {
+    Type *ScalarTy = Op->Scalars.front()->getType();
+    auto It = MinBWs.find(Op);
+    if (It != MinBWs.end())
+      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
+    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
+    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
+    if (ScalarTy->isVectorTy()) {
+      // Handle revec dead vector instructions.
+      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
+    }
+  };
+  // Memoize the relationship between blocks, i.e. if there is (at least one)
+  // non-vectorized call between the blocks. This allows to skip the analysis of
+  // the same block paths multiple times.
+  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
+      ParentOpParentToPreds;
+  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
+                               BasicBlock *OpParent) {
+    auto Key = std::make_pair(Root, OpParent);
+    if (auto It = ParentOpParentToPreds.find(Key);
+        It != ParentOpParentToPreds.end())
+      return It->second;
+    SmallVector<BasicBlock *> Worklist;
+    if (Pred)
+      Worklist.push_back(Pred);
+    else
+      Worklist.append(pred_begin(Root), pred_end(Root));
+    SmallPtrSet<const BasicBlock *, 16> Visited;
+    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
+        ParentsPairsToAdd;
+    bool Res = false;
+    auto Cleanup = make_scope_exit([&]() {
+      for (const auto &KeyPair : ParentsPairsToAdd) {
+        assert(!ParentOpParentToPreds.contains(KeyPair) &&
+               "Should not have been added before.");
+        ParentOpParentToPreds.try_emplace(KeyPair, Res);
+      }
+    });
+    while (!Worklist.empty()) {
+      BasicBlock *BB = Worklist.pop_back_val();
+      if (BB == OpParent || !Visited.insert(BB).second)
+        continue;
+      if (auto It = ParentOpParentToPreds.find(std::make_pair(BB, OpParent));
+          It != ParentOpParentToPreds.end()) {
+        Res = It->second;
+        return Res;
+      }
+      ParentsPairsToAdd.insert(std::make_pair(BB, OpParent));
+      unsigned BlockSize = BB->size();
+      if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
+        return Res;
+      Budget += BlockSize;
+      if (Budget > BudgetLimit)
+        return Res;
+      if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
+                                          BB->getTerminator()))
+        return Res;
+      Worklist.append(pred_begin(BB), pred_end(BB));
+    }
+    Res = true;
+    return Res;
+  };
+  SmallVector<const TreeEntry *> LiveEntries(1, Root);
+  while (!LiveEntries.empty()) {
+    const TreeEntry *Entry = LiveEntries.pop_back_val();
+    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
+    if (Operands.empty())
+      continue;
+    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
+    BasicBlock *Parent = LastInst->getParent();
+    for (const TreeEntry *Op : Operands) {
+      if (!Op->isGather())
+        LiveEntries.push_back(Op);
+      if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
+          (Op->isGather() && allConstant(Op->Scalars)))
+        continue;
+      Budget = 0;
+      BasicBlock *Pred = Entry->getOpcode() == Instruction::PHI
+                             ? cast<PHINode>(Entry->getMainOp())
+                                   ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx)
+                             : nullptr;
+      BasicBlock *OpParent;
+      Instruction *OpLastInst;
+      if (Op->isGather()) {
+        assert(Entry->getOpcode() == Instruction::PHI &&
+               "Expected phi node only.");
+        OpParent = cast<PHINode>(Entry->getMainOp())
+                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
+        OpLastInst = OpParent->getTerminator();
+        for (Value *V : Op->Scalars) {
+          auto *Inst = dyn_cast<Instruction>(V);
+          if (!Inst)
+            continue;
+          if (isVectorized(V)) {
+            OpParent = Inst->getParent();
+            OpLastInst = Inst;
+            break;
+          }
+        }
+      } else {
+        OpLastInst = EntriesToLastInstruction.at(Op);
+        OpParent = OpLastInst->getParent();
+      }
+      // Check the call instructions within the same basic blocks.
+      if (OpParent == Parent) {
+        if (Entry->getOpcode() == Instruction::PHI) {
+          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
+            AddCosts(Op);
+          continue;
+        }
+        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
+          AddCosts(Op);
+        continue;
+      }
+      // Check for call instruction in between blocks.
+      // 1. Check entry's block to the head.
+      if (Entry->getOpcode() != Instruction::PHI &&
+          !CheckForNonVecCallsInSameBlock(
+              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
+              LastInst)) {
+        AddCosts(Op);
+        continue;
+      }
+      // 2. Check op's block from the end.
+      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
+                                          OpParent->getTerminator())) {
+        AddCosts(Op);
+        continue;
+      }
+      // 3. Check the predecessors of entry's block till op's block.
+      if (!CheckPredecessors(Parent, Pred, OpParent)) {
+        AddCosts(Op);
+        continue;
       }
-      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
     }
-
-    Prev = TE;
   }
 
   return Cost;
@@ -13061,8 +13174,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     }
   }
 
-  InstructionCost SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  Cost += ExtractCost;
   auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                     bool) {
     InstructionCost C = 0;
@@ -13201,12 +13313,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     }
   }
 
+  InstructionCost SpillCost = InstructionCost::getInvalid();
+  if (Cost < -SLPCostThreshold) {
+    SpillCost = getSpillCost();
+    Cost += SpillCost;
+  }
 #ifndef NDEBUG
   SmallString<256> Str;
   {
     raw_svector_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
+    OS << "SLP: Spill Cost = ";
+    if (SpillCost.isValid())
+      OS << SpillCost;
+    else
+      OS << "<skipped>";
+    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
        << "SLP: Total Cost = " << Cost << ".\n";
   }
   LLVM_DEBUG(dbgs() << Str);

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -1740,7 +1740,9 @@ entry:
 define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-LABEL: define void @f
 ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; CHECK-NEXT:    br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-NEXT:    [[Z1:%.*]] = call float @fabsf(float [[Z0]])
 ; CHECK-NEXT:    br label [[BAZ]]
 ; CHECK:       baz:
-; CHECK-NEXT:    store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT:    store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT:    store i64 [[X1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; DEFAULT-LABEL: define void @f
 ; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; DEFAULT-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; DEFAULT-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 8
+; DEFAULT-NEXT:    [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; DEFAULT-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; DEFAULT-NEXT:    br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; DEFAULT:       foo:
 ; DEFAULT-NEXT:    [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; DEFAULT-NEXT:    [[Z1:%.*]] = call float @fabsf(float [[Z0]])
 ; DEFAULT-NEXT:    br label [[BAZ]]
 ; DEFAULT:       baz:
-; DEFAULT-NEXT:    store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; DEFAULT-NEXT:    store i64 [[X0]], ptr [[Q]], align 8
+; DEFAULT-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; DEFAULT-NEXT:    store i64 [[X1]], ptr [[Q1]], align 8
 ; DEFAULT-NEXT:    ret void
 ;
   %x0 = load i64, ptr %p