Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3e7e708
[SelectionDAG] Avoid store merging across function calls
mikhailramalho Feb 28, 2025
5b1fc65
Check call_end instead of start
mikhailramalho Mar 4, 2025
be40ec2
Only walk over mem operation chains
mikhailramalho Mar 4, 2025
c3298ff
Don't go over NumConsecutiveStores
mikhailramalho Mar 4, 2025
b56d1b1
Use SDValues
mikhailramalho Mar 4, 2025
82420c7
Added fallthrough
mikhailramalho Mar 4, 2025
96c8e53
Add Visited list to cache the walk
mikhailramalho Mar 5, 2025
3574370
Moved increment
mikhailramalho Mar 5, 2025
f9393d5
Updated test case
mikhailramalho Mar 5, 2025
d86ec01
Enable merge by default for scalars
mikhailramalho Mar 5, 2025
04bca6d
Rewrite walk back algo to keep track of calls found
mikhailramalho Mar 5, 2025
f27092f
Check final type before we prevent merges
mikhailramalho Mar 6, 2025
9faa629
No need to check operands. It's checked in the start of the loop
mikhailramalho Mar 17, 2025
b326da1
Assert operand type
mikhailramalho Mar 17, 2025
c858020
Moved peekThroughBitcasts into an assertion
mikhailramalho Mar 17, 2025
b6b1521
Use getChain instead of accessing the operand 0
mikhailramalho Mar 17, 2025
18e68ea
Make hasCallInLdStChain a member function
mikhailramalho Mar 18, 2025
3bc2b22
Added test case
mikhailramalho Mar 18, 2025
816a235
Merge remote-tracking branch 'origin/main' into dag-spillcost-fix
mikhailramalho Mar 19, 2025
904641f
Removed duplicated test after merge
mikhailramalho Mar 19, 2025
a255f16
No need to declare intrinsics anymore
mikhailramalho Mar 19, 2025
75f4caa
Removed unused args
mikhailramalho Mar 19, 2025
de96633
Address comment
mikhailramalho Mar 19, 2025
69c361c
Address comment
mikhailramalho Mar 19, 2025
0dfd354
Removed todo
mikhailramalho Mar 19, 2025
e73c49d
Simplify interface
mikhailramalho Mar 19, 2025
a88e73b
Merge remote-tracking branch 'origin/main' into dag-spillcost-fix
mikhailramalho Mar 19, 2025
d6c848d
Remove assert that fails when building blender_r
mikhailramalho Mar 20, 2025
0189f30
Address comment
mikhailramalho Mar 20, 2025
99e11ae
Merge remote-tracking branch 'origin/main' into dag-spillcost-fix
mikhailramalho Mar 20, 2025
67b3b65
Update test
mikhailramalho Mar 21, 2025
ed8a5fd
Removed todo
mikhailramalho Mar 21, 2025
21ba7b2
Merge remote-tracking branch 'origin/main' into dag-spillcost-fix
mikhailramalho Mar 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -3506,6 +3506,10 @@ class TargetLoweringBase {
/// The default implementation just freezes the set of reserved registers.
virtual void finalizeLowering(MachineFunction &MF) const;

/// Returns true if it is profitable to merge a store of loads of the given
/// type when there are function calls between the loads and the store.
/// Merging keeps the loaded value live across the call, so targets should
/// override this to reject types that are expensive to preserve across a
/// call (e.g. types that would have to be spilled and reloaded around it).
virtual bool shouldMergeStoreOfLoadsOverCall(EVT) const { return true; }

//===----------------------------------------------------------------------===//
// GlobalISel Hooks
//===----------------------------------------------------------------------===//
Expand Down
50 changes: 50 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21553,6 +21553,56 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
}

// Returns true if a call boundary (CALLSEQ_END) is crossed on the token-chain
// walk from a candidate store back to its paired load. Merging such a pair
// would force the merged value to stay live across the call, which a target
// may consider unprofitable (see shouldMergeStoreOfLoadsOverCall).
auto HasCallInLdStChain = [](SmallVectorImpl<MemOpLink> &StoreNodes,
                             SmallVectorImpl<MemOpLink> &LoadNodes,
                             unsigned NumStores) {
  for (unsigned i = 0; i < NumStores; ++i) {
    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
    // The stored value is expected to be exactly the i-th candidate load,
    // possibly hidden behind bitcasts.
    SDValue Val = peekThroughBitcasts(St->getValue());
    LoadSDNode *Ld = cast<LoadSDNode>(Val);
    assert(Ld == LoadNodes[i].MemNode && "Load and store mismatch");

    // DFS over chain operands from the store towards the load. Each worklist
    // entry records whether a CALLSEQ_END was crossed on the path leading to
    // that node.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<std::pair<const SDNode *, bool>, 8> Worklist;
    Worklist.emplace_back(St->getOperand(0).getNode(), false);

    while (!Worklist.empty()) {
      auto [Node, FoundCall] = Worklist.pop_back_val();
      // NOTE(review): Visited caches the node only, not the FoundCall flag.
      // If a node is first reached via a call-free path, a later path that
      // reaches it through a CALLSEQ_END is skipped entirely — confirm this
      // merge-permissive behavior is intended.
      if (!Visited.insert(Node).second || Node->getNumOperands() == 0)
        continue;

      switch (Node->getOpcode()) {
      case ISD::CALLSEQ_END:
        // Crossed the end of a call sequence: keep walking the chain with the
        // found-call flag set.
        Worklist.emplace_back(Node->getOperand(0).getNode(), true);
        break;
      case ISD::TokenFactor:
        // A TokenFactor merges several chains; follow all of them with the
        // current flag.
        for (SDValue Op : Node->ops())
          Worklist.emplace_back(Op.getNode(), FoundCall);
        break;
      case ISD::LOAD:
        // Reached the paired load: report whether any call was crossed on
        // this path.
        if (Node == Ld)
          return FoundCall;
        // A different load: fall through and keep walking up its chain.
        [[fallthrough]];
      default:
        // Any other chained operation: continue through its chain operand.
        if (Node->getNumOperands() > 0)
          Worklist.emplace_back(Node->getOperand(0).getNode(), FoundCall);
        break;
      }
    }
    // NOTE(review): this return sits inside the for loop, so every path
    // through the body returns during i == 0 and pairs 1..NumStores-1 are
    // never walked — confirm whether the loop should continue to the next
    // store/load pair instead.
    return false;
  }
  return false;
};

// Check if there is a call in the load/store chain.
if (!TLI.shouldMergeStoreOfLoadsOverCall(JointMemOpVT) &&
HasCallInLdStChain(StoreNodes, LoadNodes, NumElem)) {
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
continue;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed there is a better integration here. The way you have this phrased, we can either merge to the vector type or not merge at all.

I think what you actually want to do here is to add a couple of parameters to the existing canMergeStoresTo interface. If you add both the SrcVT and an "IsOverCall" boolean, then having that interface return the answer suggested above in the IsOverCall case will result in a wider scalar type being chosen when one is available.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though, I'll note I'm also fine in this being a follow up change. I'm happy to pick that up if you want to focus on just getting this in.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy with either option: to land this PR and do a follow-up patch, or work on this one. Up to you.

Also, I did a couple of tests with your first suggestion on checking for CSR for a given type in mikhailramalho@df27cb2, should I give up on that approach? There are still a couple of crashes I need to debug.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's go for minimum viable patch. Update the existing TLI hook you added with the source parameter, let's get that landed, and then iterate.


SDLoc LoadDL(LoadNodes[0].MemNode);
SDLoc StoreDL(StoreNodes[0].MemNode);

Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,13 @@ class RISCVTargetLowering : public TargetLowering {
return false;
}

/// Only allow merging stores of loads across a function call when the merged
/// type is a scalar integer. A merged vector value would have to be spilled
/// and reloaded around the call, which is more expensive than keeping the
/// individual scalars live in callee-saved registers.
bool shouldMergeStoreOfLoadsOverCall(EVT VT) const override {
  // Equivalent to VT.isScalarInteger(): an integer type that is not a vector.
  return VT.isInteger() && !VT.isVector();
}

/// For available scheduling models FDIV + two independent FMULs are much
/// faster than two FDIVs.
unsigned combineRepeatedFPDivisors() const override;
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/RISCV/stores-of-loads-merging.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,40 +13,40 @@ define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
; CHECK-NEXT: .cfi_offset s1, -24
; CHECK-NEXT: .cfi_offset s2, -32
; CHECK-NEXT: csrr a6, vlenb
; CHECK-NEXT: sub sp, sp, a6
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
; CHECK-NEXT: .cfi_offset s3, -40
; CHECK-NEXT: .cfi_offset s4, -48
; CHECK-NEXT: mv s0, a5
; CHECK-NEXT: mv s1, a4
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: ld s3, 0(a2)
; CHECK-NEXT: ld s4, 8(a2)
; CHECK-NEXT: mv s2, a3
; CHECK-NEXT: call g
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: sd s3, 0(s2)
; CHECK-NEXT: sd s4, 8(s2)
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vse64.v v8, (s2)
; CHECK-NEXT: vle64.v v8, (s1)
; CHECK-NEXT: vse64.v v8, (s0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 48
; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s3, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s4, 0(sp) # 8-byte Folded Reload
; CHECK-NEXT: .cfi_restore ra
; CHECK-NEXT: .cfi_restore s0
; CHECK-NEXT: .cfi_restore s1
; CHECK-NEXT: .cfi_restore s2
; CHECK-NEXT: .cfi_restore s3
; CHECK-NEXT: .cfi_restore s4
; CHECK-NEXT: addi sp, sp, 48
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
Expand Down
Loading