Skip to content

Commit be1ca7e

Browse files
committed
[SDAG] Allow folding stack slots into sincos/frexp in more cases
This adds a new helper `CanFoldStoreIntoFPLibCall()` to check that it is safe to fold a store into a node that will expand to a library call that takes output pointers. This requires checking for two (independent) properties: 1. The store is not within a CALLSEQ_START..CALLSEQ_END pair * If it is, the expansion would lead to nested call sequences (which is invalid) 2. The node does not appear as a predecessor to the store * If it does, attempting to merge the store into the call would result in a cycle in the DAG These two properties are checked as part of the same traversal in `CanFoldStoreIntoFPLibCall()`
1 parent ef50d79 commit be1ca7e

File tree

6 files changed

+236
-174
lines changed

6 files changed

+236
-174
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2474,6 +2474,45 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
24742474
return Subvectors[0];
24752475
}
24762476

2477+
/// Given a store node \p StoreNode, return true if it is safe to fold that node
2478+
/// into \p FPNode, which expands to a library call with output pointers.
2479+
static bool CanFoldStoreIntoFPLibCall(StoreSDNode *StoreNode, SDNode *FPNode) {
2480+
SmallVector<const SDNode *, 8> Worklist;
2481+
SmallVector<const SDNode *, 8> DeferredNodes;
2482+
SmallPtrSet<const SDNode *, 16> Visited;
2483+
2484+
// Skip FPNode use by StoreNode (that's the use we want to fold into FPNode).
2485+
for (SDValue Op : StoreNode->ops())
2486+
if (Op.getNode() != FPNode)
2487+
Worklist.push_back(Op.getNode());
2488+
2489+
while (!Worklist.empty()) {
2490+
const SDNode *Node = Worklist.pop_back_val();
2491+
auto [_, Inserted] = Visited.insert(Node);
2492+
if (!Inserted)
2493+
continue;
2494+
2495+
// Reached the FPNode (would result in a cycle).
2496+
// OR Reached CALLSEQ_START (would result in nested call sequences).
2497+
if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
2498+
return false;
2499+
2500+
if (Node->getOpcode() == ISD::CALLSEQ_END) {
2501+
// Defer looking into call sequences (so we can check we're outside one).
2502+
// We still need to look through these for the predecessor check.
2503+
DeferredNodes.push_back(Node);
2504+
continue;
2505+
}
2506+
2507+
for (SDValue Op : Node->ops())
2508+
Worklist.push_back(Op.getNode());
2509+
}
2510+
2511+
// True if we're outside a call sequence and don't have the FPNode as a
2512+
// predecessor. No cycles or nested call sequences possible.
2513+
return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes);
2514+
}
2515+
24772516
bool SelectionDAG::expandMultipleResultFPLibCall(
24782517
RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
24792518
std::optional<unsigned> CallRetResNo) {
@@ -2502,11 +2541,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
25022541

25032542
// Find users of the node that store the results (and share input chains). The
25042543
// destination pointers can be used instead of creating stack allocations.
2505-
// FIXME: This should allow stores with the same chains (not just the entry
2506-
// chain), but there's a risk the store is within a (CALLSEQ_START,
2507-
// CALLSEQ_END) pair, which after this expansion will lead to nested call
2508-
// sequences.
2509-
SDValue InChain = getEntryNode();
2544+
SDValue StoresInChain{};
25102545
SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
25112546
for (SDNode *User : Node->uses()) {
25122547
if (!ISD::isNormalStore(User))
@@ -2515,13 +2550,25 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
25152550
SDValue StoreValue = ST->getValue();
25162551
unsigned ResNo = StoreValue.getResNo();
25172552
Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
2518-
if (CallRetResNo == ResNo || !ST->isSimple() ||
2553+
if (
2554+
// Ensure the store corresponds to an output pointer.
2555+
CallRetResNo == ResNo ||
2556+
// Ensure the store is not atomic or volatile.
2557+
!ST->isSimple() ||
2558+
// Ensure the store is in the default address space.
25192559
ST->getAddressSpace() != 0 ||
2560+
// Ensure the store is properly aligned.
25202561
ST->getAlign() <
25212562
getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
2522-
ST->getChain() != InChain)
2563+
// Ensure all store chains are the same (so they don't alias).
2564+
(StoresInChain && ST->getChain() != StoresInChain)
2565+
// Avoid:
2566+
// 1. Creating cyclic dependencies.
2567+
// 2. Expanding the node to a call within a call sequence.
2568+
|| !CanFoldStoreIntoFPLibCall(ST, Node))
25232569
continue;
25242570
ResultStores[ResNo] = ST;
2571+
StoresInChain = ST->getChain();
25252572
}
25262573

25272574
TargetLowering::ArgListTy Args;
@@ -2563,6 +2610,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
25632610
Type *RetType = CallRetResNo.has_value()
25642611
? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
25652612
: Type::getVoidTy(Ctx);
2613+
SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
25662614
SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
25672615
TLI->getPointerTy(getDataLayout()));
25682616
TargetLowering::CallLoweringInfo CLI(*this);

llvm/test/CodeGen/AArch64/sincos-stack-slots.ll

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,3 +253,37 @@ entry:
253253
store double %cos, ptr %out_cos, align 4
254254
ret void
255255
}
256+
257+
declare void @foo(ptr, ptr)
258+
259+
define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
260+
; CHECK-LABEL: can_fold_with_call_in_chain:
261+
; CHECK: // %bb.0: // %entry
262+
; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill
263+
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
264+
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
265+
; CHECK-NEXT: .cfi_def_cfa_offset 32
266+
; CHECK-NEXT: .cfi_offset w19, -8
267+
; CHECK-NEXT: .cfi_offset w20, -16
268+
; CHECK-NEXT: .cfi_offset w30, -24
269+
; CHECK-NEXT: .cfi_offset b8, -32
270+
; CHECK-NEXT: mov x19, x1
271+
; CHECK-NEXT: mov x20, x0
272+
; CHECK-NEXT: fmov s8, s0
273+
; CHECK-NEXT: bl foo
274+
; CHECK-NEXT: fmov s0, s8
275+
; CHECK-NEXT: mov x0, x20
276+
; CHECK-NEXT: mov x1, x19
277+
; CHECK-NEXT: bl sincosf
278+
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
279+
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
280+
; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload
281+
; CHECK-NEXT: ret
282+
entry:
283+
%sin = tail call float @llvm.sin.f32(float %x)
284+
%cos = tail call float @llvm.cos.f32(float %x)
285+
call void @foo(ptr %a, ptr %b)
286+
store float %sin, ptr %a, align 4
287+
store float %cos, ptr %b, align 4
288+
ret void
289+
}

llvm/test/CodeGen/PowerPC/f128-arith.ll

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
13651365
; CHECK-LABEL: qpFREXP:
13661366
; CHECK: # %bb.0: # %entry
13671367
; CHECK-NEXT: mflr r0
1368-
; CHECK-NEXT: .cfi_def_cfa_offset 64
1368+
; CHECK-NEXT: stdu r1, -32(r1)
1369+
; CHECK-NEXT: std r0, 48(r1)
1370+
; CHECK-NEXT: .cfi_def_cfa_offset 32
13691371
; CHECK-NEXT: .cfi_offset lr, 16
1370-
; CHECK-NEXT: .cfi_offset r30, -16
1371-
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
1372-
; CHECK-NEXT: stdu r1, -64(r1)
1373-
; CHECK-NEXT: std r0, 80(r1)
1374-
; CHECK-NEXT: addi r5, r1, 44
1375-
; CHECK-NEXT: mr r30, r4
13761372
; CHECK-NEXT: lxv v2, 0(r3)
1373+
; CHECK-NEXT: mr r5, r4
13771374
; CHECK-NEXT: bl frexpf128
13781375
; CHECK-NEXT: nop
1379-
; CHECK-NEXT: lwz r3, 44(r1)
1380-
; CHECK-NEXT: stw r3, 0(r30)
1381-
; CHECK-NEXT: addi r1, r1, 64
1376+
; CHECK-NEXT: addi r1, r1, 32
13821377
; CHECK-NEXT: ld r0, 16(r1)
1383-
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
13841378
; CHECK-NEXT: mtlr r0
13851379
; CHECK-NEXT: blr
13861380
;
13871381
; CHECK-P8-LABEL: qpFREXP:
13881382
; CHECK-P8: # %bb.0: # %entry
13891383
; CHECK-P8-NEXT: mflr r0
1390-
; CHECK-P8-NEXT: .cfi_def_cfa_offset 64
1384+
; CHECK-P8-NEXT: stdu r1, -32(r1)
1385+
; CHECK-P8-NEXT: std r0, 48(r1)
1386+
; CHECK-P8-NEXT: .cfi_def_cfa_offset 32
13911387
; CHECK-P8-NEXT: .cfi_offset lr, 16
1392-
; CHECK-P8-NEXT: .cfi_offset r30, -16
1393-
; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
1394-
; CHECK-P8-NEXT: stdu r1, -64(r1)
1395-
; CHECK-P8-NEXT: std r0, 80(r1)
1396-
; CHECK-P8-NEXT: addi r5, r1, 44
1397-
; CHECK-P8-NEXT: mr r30, r4
13981388
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
1389+
; CHECK-P8-NEXT: mr r5, r4
13991390
; CHECK-P8-NEXT: xxswapd v2, vs0
14001391
; CHECK-P8-NEXT: bl frexpf128
14011392
; CHECK-P8-NEXT: nop
1402-
; CHECK-P8-NEXT: lwz r3, 44(r1)
1403-
; CHECK-P8-NEXT: stw r3, 0(r30)
1404-
; CHECK-P8-NEXT: addi r1, r1, 64
1393+
; CHECK-P8-NEXT: addi r1, r1, 32
14051394
; CHECK-P8-NEXT: ld r0, 16(r1)
1406-
; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
14071395
; CHECK-P8-NEXT: mtlr r0
14081396
; CHECK-P8-NEXT: blr
14091397
entry:

0 commit comments

Comments
 (0)