diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 053e9d14dc2f7..58ac87206b9a6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3520,6 +3520,10 @@ class TargetLoweringBase {
   /// The default implementation just freezes the set of reserved registers.
   virtual void finalizeLowering(MachineFunction &MF) const;
 
+  /// Returns true if it's profitable to allow merging stores of loads when
+  /// there are function calls between the load and the store.
+  virtual bool shouldMergeStoreOfLoadsOverCall(EVT, EVT) const { return true; }
+
   //===----------------------------------------------------------------------===//
   // GlobalISel Hooks
   //===----------------------------------------------------------------------===//
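The hook sees both the source element type and the prospective merged type, so a target can base its answer on how the merged value would have to survive the call. As a purely illustrative sketch (MyTargetLowering and the 64-bit cutoff are assumptions, not part of this patch), an out-of-tree target could veto any merge whose result no longer fits in one scalar register:

// Hypothetical override (illustration only): allow merging over a call only
// when the merged value still fits in a single 64-bit scalar register, so it
// can be kept in a scalar callee-saved register across the call.
bool MyTargetLowering::shouldMergeStoreOfLoadsOverCall(EVT SrcVT,
                                                       EVT MergedVT) const {
  return MergedVT.isScalarInteger() && MergedVT.getFixedSizeInBits() <= 64;
}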
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a54857e1037e2..cd2a7fc3f9ab6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -792,6 +792,10 @@ namespace {
         SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
         SDNode *RootNode);
 
+    /// Helper function for tryStoreMergeOfLoads. Checks if the load/store
+    /// chain has a call in it. \return True if a call is found.
+    bool hasCallInLdStChain(StoreSDNode *St, LoadSDNode *Ld);
+
     /// This is a helper function for mergeConsecutiveStores. Given a list of
     /// store candidates, find the first N that are consecutive in memory.
     /// Returns 0 if there are not at least 2 consecutive stores to try merging.
@@ -21152,6 +21156,38 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
   return true;
 }
 
+bool DAGCombiner::hasCallInLdStChain(StoreSDNode *St, LoadSDNode *Ld) {
+  SmallPtrSet<const SDNode *, 32> Visited;
+  SmallVector<std::pair<const SDNode *, bool>, 8> Worklist;
+  Worklist.emplace_back(St->getChain().getNode(), false);
+
+  while (!Worklist.empty()) {
+    auto [Node, FoundCall] = Worklist.pop_back_val();
+    if (!Visited.insert(Node).second || Node->getNumOperands() == 0)
+      continue;
+
+    switch (Node->getOpcode()) {
+    case ISD::CALLSEQ_END:
+      Worklist.emplace_back(Node->getOperand(0).getNode(), true);
+      break;
+    case ISD::TokenFactor:
+      for (SDValue Op : Node->ops())
+        Worklist.emplace_back(Op.getNode(), FoundCall);
+      break;
+    case ISD::LOAD:
+      if (Node == Ld)
+        return FoundCall;
+      [[fallthrough]];
+    default:
+      assert(Node->getOperand(0).getValueType() == MVT::Other &&
+             "Invalid chain type");
+      Worklist.emplace_back(Node->getOperand(0).getNode(), FoundCall);
+      break;
+    }
+  }
+  return false;
+}
+
 unsigned
 DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
                                   int64_t ElementSizeBytes) const {
@@ -21598,6 +21634,16 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
       JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
     }
 
+    // Check if there is a call in the load/store chain.
+    if (!TLI.shouldMergeStoreOfLoadsOverCall(MemVT, JointMemOpVT) &&
+        hasCallInLdStChain(cast<StoreSDNode>(StoreNodes[0].MemNode),
+                           cast<LoadSDNode>(LoadNodes[0].MemNode))) {
+      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
+      NumConsecutiveStores -= NumElem;
+      continue;
+    }
+
     SDLoc LoadDL(LoadNodes[0].MemNode);
     SDLoc StoreDL(StoreNodes[0].MemNode);
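hasCallInLdStChain only follows chain edges, carrying a found-call flag with each worklist entry: CALLSEQ_END sets the flag before continuing below the call sequence, TokenFactor fans the walk out over all its operands, and reaching the candidate load reports whether a call was crossed on that path. Below is the same worklist pattern reduced to a toy graph, with made-up Node/Kind types standing in for SDNodes (a self-contained sketch, not SelectionDAG code; fanning out through every predecessor subsumes the TokenFactor case):

#include <cassert>
#include <set>
#include <utility>
#include <vector>

// Toy stand-ins for SDNodes: each node has a kind and chain predecessors.
enum Kind { Load, Store, Call };
struct Node {
  Kind K;
  std::vector<const Node *> Preds; // chain operands
};

// Same shape as hasCallInLdStChain: walk back from the store's chain, set the
// flag once a call is crossed, and report the flag when the load is reached.
static bool hasCallBetween(const Node *St, const Node *Ld) {
  std::set<const Node *> Visited;
  std::vector<std::pair<const Node *, bool>> Worklist;
  for (const Node *P : St->Preds)
    Worklist.emplace_back(P, false);
  while (!Worklist.empty()) {
    auto [N, FoundCall] = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue;
    if (N == Ld)
      return FoundCall;
    bool Flag = FoundCall || N->K == Call; // crossing a call sets the flag
    for (const Node *P : N->Preds)
      Worklist.emplace_back(P, Flag);
  }
  return false; // the load is not on this chain at all
}

int main() {
  Node Ld{Load, {}};
  Node C{Call, {&Ld}};  // chain: Ld -> call
  Node St{Store, {&C}}; // chain: call -> St
  assert(hasCallBetween(&St, &Ld)); // store's chain passes through the call
  Node St2{Store, {&Ld}};           // direct chain, no call in between
  assert(!hasCallBetween(&St2, &Ld));
  return 0;
}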
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ffbc14a29006c..4072f8ea2fc67 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -1070,6 +1070,13 @@ class RISCVTargetLowering : public TargetLowering {
     return false;
   }
 
+  /// Disable merging loads and stores into a vector across function calls:
+  /// RISC-V has no vector callee-saved registers, so the merged vector would
+  /// be spilled and reloaded around the call, costing more than the scalars.
+  bool shouldMergeStoreOfLoadsOverCall(EVT SrcVT, EVT MergedVT) const override {
+    return !MergedVT.isVector() || SrcVT.isVector();
+  }
+
   /// For available scheduling models FDIV + two independent FMULs are much
   /// faster than two FDIVs.
   unsigned combineRepeatedFPDivisors() const override;
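The RISC-V predicate only vetoes scalar-to-vector merges; scalar-to-scalar widening (e.g. two i32 loads into one i64) and merges whose sources are already vectors remain allowed over calls, as the test updates below show. A tiny standalone check of the three cases, with riscvAllowsMerge as a hypothetical mirror of the override above (illustration only):

#include <cassert>

// Hypothetical mirror of the RISC-V override above, on plain bools: allow
// the merge unless it would turn scalar memory ops into a vector op.
static bool riscvAllowsMerge(bool SrcIsVector, bool MergedIsVector) {
  return !MergedIsVector || SrcIsVector;
}

int main() {
  assert(riscvAllowsMerge(false, false)); // i32 -> i64: scalar CSRs suffice
  assert(!riscvAllowsMerge(false, true)); // f64 -> v2f64: would spill vector
  assert(riscvAllowsMerge(true, true));   // vector -> vector: already vector
  return 0;
}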
diff --git a/llvm/test/CodeGen/RISCV/rvv/stores-of-loads-merging.ll b/llvm/test/CodeGen/RISCV/rvv/stores-of-loads-merging.ll
index 7699bb1ae1371..5bf8af262e73b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stores-of-loads-merging.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stores-of-loads-merging.ll
@@ -2,11 +2,8 @@
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+b | FileCheck %s --check-prefixes=CHECK,V
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+b,+zvfh | FileCheck %s --check-prefixes=CHECK,ZVFH
 
-declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata)
 declare void @g()
 
-; TODO: Merging scalars into vectors is unprofitable because we have no
-; vector CSRs which creates additional spills around the call.
 define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
 ; CHECK-LABEL: f:
 ; CHECK:       # %bb.0:
@@ -16,40 +13,40 @@ define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
 ; CHECK-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    .cfi_offset s0, -16
 ; CHECK-NEXT:    .cfi_offset s1, -24
 ; CHECK-NEXT:    .cfi_offset s2, -32
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    sub sp, sp, a6
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
+; CHECK-NEXT:    .cfi_offset s3, -40
+; CHECK-NEXT:    .cfi_offset s4, -48
 ; CHECK-NEXT:    mv s0, a5
 ; CHECK-NEXT:    mv s1, a4
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    vse64.v v8, (a1)
-; CHECK-NEXT:    vle64.v v8, (a2)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    ld s3, 0(a2)
+; CHECK-NEXT:    ld s4, 8(a2)
 ; CHECK-NEXT:    mv s2, a3
 ; CHECK-NEXT:    call g
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    sd s3, 0(s2)
+; CHECK-NEXT:    sd s4, 8(s2)
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vse64.v v8, (s2)
 ; CHECK-NEXT:    vle64.v v8, (s1)
 ; CHECK-NEXT:    vse64.v v8, (s0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 48
 ; CHECK-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s4, 0(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_restore ra
 ; CHECK-NEXT:    .cfi_restore s0
 ; CHECK-NEXT:    .cfi_restore s1
 ; CHECK-NEXT:    .cfi_restore s2
+; CHECK-NEXT:    .cfi_restore s3
+; CHECK-NEXT:    .cfi_restore s4
 ; CHECK-NEXT:    addi sp, sp, 48
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
@@ -78,13 +75,13 @@ define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
   ret void
 }
 
-define void @f1(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
+define void @f1(ptr %p, ptr %q, double %t) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    fcvt.wu.d a0, fa0, rtz
-; CHECK-NEXT:    vse64.v v8, (a3)
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %x0 = load i64, ptr %p
   %p.1 = getelementptr i64, ptr %p, i64 1
@@ -93,7 +90,6 @@ define void @f1(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
   store i64 %x0, ptr %q
   %q.1 = getelementptr i64, ptr %q, i64 1
   store i64 %x1, ptr %q.1
-
   ret void
 }
 
@@ -515,28 +511,26 @@ define void @two_half_unaligned(ptr %p, ptr %q) {
 ; ZVFH-NEXT:    .cfi_def_cfa_offset 32
 ; ZVFH-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; ZVFH-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; ZVFH-NEXT:    fsd fs0, 8(sp) # 8-byte Folded Spill
+; ZVFH-NEXT:    fsd fs1, 0(sp) # 8-byte Folded Spill
 ; ZVFH-NEXT:    .cfi_offset ra, -8
 ; ZVFH-NEXT:    .cfi_offset s0, -16
-; ZVFH-NEXT:    csrr a2, vlenb
-; ZVFH-NEXT:    sub sp, sp, a2
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
-; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vle16.v v8, (a0)
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    .cfi_offset fs0, -24
+; ZVFH-NEXT:    .cfi_offset fs1, -32
+; ZVFH-NEXT:    flh fs0, 0(a0)
+; ZVFH-NEXT:    flh fs1, 2(a0)
 ; ZVFH-NEXT:    mv s0, a1
 ; ZVFH-NEXT:    call g
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vse16.v v8, (s0)
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 32
+; ZVFH-NEXT:    fsh fs0, 0(s0)
+; ZVFH-NEXT:    fsh fs1, 2(s0)
 ; ZVFH-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; ZVFH-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; ZVFH-NEXT:    fld fs0, 8(sp) # 8-byte Folded Reload
+; ZVFH-NEXT:    fld fs1, 0(sp) # 8-byte Folded Reload
 ; ZVFH-NEXT:    .cfi_restore ra
 ; ZVFH-NEXT:    .cfi_restore s0
+; ZVFH-NEXT:    .cfi_restore fs0
+; ZVFH-NEXT:    .cfi_restore fs1
 ; ZVFH-NEXT:    addi sp, sp, 32
 ; ZVFH-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFH-NEXT:    ret
@@ -552,9 +546,6 @@ define void @two_half_unaligned(ptr %p, ptr %q) {
   ret void
 }
 
-
-; TODO: This one is currently a vector which is unprofitable, we should
-; use i64 instead.
 define void @two_float(ptr %p, ptr %q) {
 ; CHECK-LABEL: two_float:
 ; CHECK:       # %bb.0:
@@ -598,28 +589,26 @@ define void @two_float_unaligned(ptr %p, ptr %q) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs0, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs1, 0(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    .cfi_offset s0, -16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    .cfi_offset fs0, -24
+; CHECK-NEXT:    .cfi_offset fs1, -32
+; CHECK-NEXT:    flw fs0, 0(a0)
+; CHECK-NEXT:    flw fs1, 4(a0)
 ; CHECK-NEXT:    mv s0, a1
 ; CHECK-NEXT:    call g
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vse32.v v8, (s0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 32
+; CHECK-NEXT:    fsw fs0, 0(s0)
+; CHECK-NEXT:    fsw fs1, 4(s0)
 ; CHECK-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs0, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs1, 0(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_restore ra
 ; CHECK-NEXT:    .cfi_restore s0
+; CHECK-NEXT:    .cfi_restore fs0
+; CHECK-NEXT:    .cfi_restore fs1
 ; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
@@ -679,28 +668,26 @@ define void @two_double(ptr %p, ptr %q) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs0, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs1, 0(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    .cfi_offset s0, -16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    .cfi_offset fs0, -24
+; CHECK-NEXT:    .cfi_offset fs1, -32
+; CHECK-NEXT:    fld fs0, 0(a0)
+; CHECK-NEXT:    fld fs1, 8(a0)
 ; CHECK-NEXT:    mv s0, a1
 ; CHECK-NEXT:    call g
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vse64.v v8, (s0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 32
+; CHECK-NEXT:    fsd fs0, 0(s0)
+; CHECK-NEXT:    fsd fs1, 8(s0)
 ; CHECK-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs0, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs1, 0(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_restore ra
 ; CHECK-NEXT:    .cfi_restore s0
+; CHECK-NEXT:    .cfi_restore fs0
+; CHECK-NEXT:    .cfi_restore fs1
 ; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret