@@ -111,10 +111,17 @@ static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
111111 cl::desc("Number limit for gluing ld/st of memcpy."),
112112 cl::Hidden, cl::init(0));
113113
// Hidden command-line option `has-predecessor-max-steps` (initial value 8192).
// Its value is what SelectionDAG::getHasPredecessorMaxSteps() returns.
114+ static cl::opt<unsigned>
115+ MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192),
116+ cl::desc("DAG combiner limit number of steps when searching DAG "
117+ "for predecessor nodes"));
118+
/// Debug helper: inside LLVM_DEBUG, write \p Msg to dbgs() and then dump the
/// node of \p V, passing \p G to the dump call.
114119static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
115120 LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
116121}
117122
/// Return the current value of the `has-predecessor-max-steps` option
/// (MaxSteps).
123+ unsigned SelectionDAG::getHasPredecessorMaxSteps() { return MaxSteps; }
124+
118125//===----------------------------------------------------------------------===//
119126// ConstantFPSDNode Class
120127//===----------------------------------------------------------------------===//
@@ -2474,6 +2481,51 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
24742481 return Subvectors[0];
24752482}
24762483
2484+ /// Given a store node \p StoreNode, return true if it is safe to fold that node
2485+ /// into \p FPNode, which expands to a library call with output pointers.
 ///
 /// The operands of \p StoreNode (other than \p FPNode itself) are walked
 /// transitively. The result is false if the walk reaches \p FPNode or a
 /// CALLSEQ_START node, or if the number of visited nodes reaches the
 /// getHasPredecessorMaxSteps() limit (a limit of 0 disables that cut-off).
 /// CALLSEQ_END nodes are not expanded by the walk; they are collected and
 /// handed to SDNode::hasPredecessorHelper for the final predecessor check.
2486+ static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode,
2487+ SDNode *FPNode) {
2488+ SmallVector<const SDNode *, 8> Worklist;
2489+ SmallVector<const SDNode *, 8> DeferredNodes;
2490+ SmallPtrSet<const SDNode *, 16> Visited;
2491+
2492+ // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode).
2493+ for (SDValue Op : StoreNode->ops())
2494+ if (Op.getNode() != FPNode)
2495+ Worklist.push_back(Op.getNode());
2496+
2497+ unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
2498+ while (!Worklist.empty()) {
2499+ const SDNode *Node = Worklist.pop_back_val();
 // Process each node at most once.
2500+ auto [_, Inserted] = Visited.insert(Node);
2501+ if (!Inserted)
2502+ continue;
2503+
 // Give up (conservatively answer "not safe") once the visited set reaches
 // the step limit. A MaxSteps of 0 skips this check.
2504+ if (MaxSteps > 0 && Visited.size() >= MaxSteps)
2505+ return false;
2506+
2507+ // Reached the FPNode (would result in a cycle).
2508+ // OR Reached CALLSEQ_START (would result in nested call sequences).
2509+ if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
2510+ return false;
2511+
2512+ if (Node->getOpcode() == ISD::CALLSEQ_END) {
2513+ // Defer looking into call sequences (so we can check we're outside one).
2514+ // We still need to look through these for the predecessor check.
2515+ DeferredNodes.push_back(Node);
2516+ continue;
2517+ }
2518+
2519+ for (SDValue Op : Node->ops())
2520+ Worklist.push_back(Op.getNode());
2521+ }
2522+
2523+ // True if we're outside a call sequence and don't have the FPNode as a
2524+ // predecessor. No cycles or nested call sequences possible.
 // The deferred CALLSEQ_END nodes and the Visited set built above are passed
 // on, together with the same MaxSteps limit.
2525+ return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes,
2526+ MaxSteps);
2527+ }
2528+
24772529bool SelectionDAG::expandMultipleResultFPLibCall(
24782530 RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
24792531 std::optional<unsigned> CallRetResNo) {
@@ -2502,26 +2554,35 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
25022554
25032555 // Find users of the node that store the results (and share input chains). The
25042556 // destination pointers can be used instead of creating stack allocations.
2505- // FIXME: This should allow stores with the same chains (not just the entry
2506- // chain), but there's a risk the store is within a (CALLSEQ_START,
2507- // CALLSEQ_END) pair, which after this expansion will lead to nested call
2508- // sequences.
2509- SDValue InChain = getEntryNode();
2557+ SDValue StoresInChain;
25102558 SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
25112559 for (SDNode *User : Node->uses()) {
25122560 if (!ISD::isNormalStore(User))
25132561 continue;
25142562 auto *ST = cast<StoreSDNode>(User);
25152563 SDValue StoreValue = ST->getValue();
25162564 unsigned ResNo = StoreValue.getResNo();
2565+ // Ensure the store corresponds to an output pointer.
2566+ if (CallRetResNo == ResNo)
2567+ continue;
2568+ // Ensure the store is simple (not atomic/volatile) and in address space 0.
2569+ if (!ST->isSimple() || ST->getAddressSpace() != 0)
2570+ continue;
2571+ // Ensure all store chains are the same (so they don't alias).
2572+ if (StoresInChain && ST->getChain() != StoresInChain)
2573+ continue;
2574+ // Ensure the store is properly aligned.
25172575 Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
2518- if (CallRetResNo == ResNo || !ST->isSimple() ||
2519- ST->getAddressSpace() != 0 ||
2520- ST->getAlign() <
2521- getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
2522- ST->getChain() != InChain)
2576+ if (ST->getAlign() <
2577+ getDataLayout().getABITypeAlign(StoreType->getScalarType()))
2578+ continue;
2579+ // Avoid:
2580+ // 1. Creating cyclic dependencies.
2581+ // 2. Expanding the node to a call within a call sequence.
2582+ if (!canFoldStoreIntoLibCallOutputPointers(ST, Node))
25232583 continue;
25242584 ResultStores[ResNo] = ST;
2585+ StoresInChain = ST->getChain();
25252586 }
25262587
25272588 TargetLowering::ArgListTy Args;
@@ -2563,6 +2624,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
25632624 Type *RetType = CallRetResNo.has_value()
25642625 ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
25652626 : Type::getVoidTy(Ctx);
2627+ SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
25662628 SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
25672629 TLI->getPointerTy(getDataLayout()));
25682630 TargetLowering::CallLoweringInfo CLI(*this);
0 commit comments