|
| 1 | +From a5ba56aadc91cc59bc8b00b77f42594d08fc31c5 Mon Sep 17 00:00:00 2001 |
| 2 | +From: nasmnc01 < [email protected]> |
| 3 | +Author: Scott Douglass < [email protected]> |
| 4 | +Date: Tue, 13 Aug 2024 10:55:51 +0100 |
| 5 | +Subject: [PATCH] [ARM][CodeGen] Prefer MEMCPY LD/ST inlining for v7-m |
| 6 | + |
| 7 | +This patch changes the behaviour of memcpy inlining on v7m targets. |
| 8 | +The old behaviour was to inline memcpys with LDM/STM instructions. |
| 9 | +Alternatively, using LD/ST instructions for memcpy inlining allowed |
| 10 | +for performance gains of 1% to 2% on selected benchmarks. |
| 11 | + |
| 12 | +Co-authored-by: Nashe Mncube < [email protected]> |
| 13 | +--- |
| 14 | + llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 116 ++++++++++++++ |
| 15 | + llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 + |
| 16 | + llvm/lib/Target/ARM/ARMSubtarget.h | 4 + |
| 17 | + llvm/test/CodeGen/ARM/memcpy-v7m.ll | 165 ++++++++++++++++++++ |
| 18 | + 4 files changed, 291 insertions(+) |
| 19 | + create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll |
| 20 | + |
| 21 | +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 22 | +index c57825949c1c..0913b2719813 100644 |
| 23 | +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 24 | ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 25 | +@@ -138,6 +138,118 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( |
| 26 | + return CallResult.second; |
| 27 | + } |
| 28 | + |
| 29 | ++/// Expand a memcpy of a known size (SizeVal bytes) into explicit loads/stores: |
| 30 | ++/// batches of up to 6 (4 on Thumb1) i32 loads followed by the matching stores, |
| 31 | ++/// then an i16 and/or i8 load/store for the 1-3 trailing bytes. |
| 32 | ++/// Returns the output chain (the incoming Chain when SizeVal is zero). |
| 33 | ++SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt( |
| 34 | ++    SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain, |
| 35 | ++    SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile, |
| 36 | ++    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { |
| 37 | ++  // Split the copy into whole 4-byte words plus a 0-3 byte tail. |
| 38 | ++  unsigned BytesLeft = SizeVal & 3; |
| 39 | ++  unsigned NumMemOps = SizeVal >> 2; |
| 40 | ++  unsigned EmittedNumMemOps = 0; |
| 41 | ++  EVT VT = MVT::i32; |
| 42 | ++  unsigned VTSize = 4; |
| 43 | ++  unsigned I = 0; |
| 44 | ++  // Thumb1 has fewer usable registers, so cap each batch at 4 loads there. |
| 45 | ++  const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6; |
| 46 | ++  SDValue TFOps[6]; |
| 47 | ++  SDValue Loads[6]; |
| 48 | ++  uint64_t SrcOff = 0, DstOff = 0; |
| 49 | ++ |
| 50 | ++  MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone; |
| 51 | ++  if (isVolatile) |
| 52 | ++    MOFlags = MachineMemOperand::Flags::MOVolatile; |
| 53 | ++  MachineMemOperand::Flags LoadMOFlags = MOFlags; |
| 54 | ++  if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), |
| 55 | ++                                   DAG.getDataLayout())) |
| 56 | ++    LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable; |
| 57 | ++  if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>()) |
| 58 | ++    if (const auto *GV = dyn_cast<GlobalVariable>(V); GV && GV->isConstant()) |
| 59 | ++      LoadMOFlags |= MachineMemOperand::Flags::MOInvariant; |
| 60 | ++  MachineMemOperand::Flags StoreMOFlags = MOFlags; |
| 61 | ++  if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), |
| 62 | ++                                   DAG.getDataLayout())) |
| 63 | ++    StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable; |
| 64 | ++ |
| 65 | ++  // Emit up to MaxLoads loads, join their chains with a TokenFactor, then |
| 66 | ++  // emit the same number of stores chained on that TokenFactor. The loads |
| 67 | ++  // and stores may get combined into ldm/stm later on. |
| 68 | ++  while (EmittedNumMemOps < NumMemOps) { |
| 69 | ++    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { |
| 70 | ++      Loads[I] = DAG.getLoad(VT, dl, Chain, |
| 71 | ++                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src, |
| 72 | ++                                         DAG.getConstant(SrcOff, dl, MVT::i32)), |
| 73 | ++                             SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), |
| 74 | ++                             LoadMOFlags); |
| 75 | ++      TFOps[I] = Loads[I].getValue(1); |
| 76 | ++      SrcOff += VTSize; |
| 77 | ++    } |
| 78 | ++    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I)); |
| 79 | ++ |
| 80 | ++    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { |
| 81 | ++      TFOps[I] = DAG.getStore( |
| 82 | ++          Chain, dl, Loads[I], |
| 83 | ++          DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, |
| 84 | ++                      DAG.getConstant(DstOff, dl, MVT::i32)), |
| 85 | ++          DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags); |
| 86 | ++      DstOff += VTSize; |
| 87 | ++    } |
| 88 | ++    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I)); |
| 89 | ++ |
| 90 | ++    EmittedNumMemOps += I; |
| 91 | ++  } |
| 92 | ++ |
| 93 | ++  if (BytesLeft == 0) |
| 94 | ++    return Chain; |
| 95 | ++ |
| 96 | ++  // Copy the 1-3 trailing bytes: a halfword while >= 2 remain, then a byte. |
| 97 | ++  unsigned BytesLeftSave = BytesLeft; |
| 98 | ++  for (I = 0; BytesLeft; ++I) { |
| 99 | ++    if (BytesLeft >= 2) { |
| 100 | ++      VT = MVT::i16; |
| 101 | ++      VTSize = 2; |
| 102 | ++    } else { |
| 103 | ++      VT = MVT::i8; |
| 104 | ++      VTSize = 1; |
| 105 | ++    } |
| 106 | ++ |
| 107 | ++    Loads[I] = DAG.getLoad(VT, dl, Chain, |
| 108 | ++                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src, |
| 109 | ++                                       DAG.getConstant(SrcOff, dl, MVT::i32)), |
| 110 | ++                           SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), |
| 111 | ++                           LoadMOFlags); |
| 112 | ++ |
| 113 | ++    TFOps[I] = Loads[I].getValue(1); |
| 114 | ++    SrcOff += VTSize; |
| 115 | ++    BytesLeft -= VTSize; |
| 116 | ++  } |
| 117 | ++  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I)); |
| 118 | ++ |
| 119 | ++  BytesLeft = BytesLeftSave; |
| 120 | ++  for (I = 0; BytesLeft; ++I) { |
| 121 | ++    if (BytesLeft >= 2) { |
| 122 | ++      VT = MVT::i16; |
| 123 | ++      VTSize = 2; |
| 124 | ++    } else { |
| 125 | ++      VT = MVT::i8; |
| 126 | ++      VTSize = 1; |
| 127 | ++    } |
| 128 | ++ |
| 129 | ++    TFOps[I] = DAG.getStore(Chain, dl, Loads[I], |
| 130 | ++                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, |
| 131 | ++                                        DAG.getConstant(DstOff, dl, MVT::i32)), |
| 132 | ++                            DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), |
| 133 | ++                            StoreMOFlags); |
| 134 | ++    DstOff += VTSize; |
| 135 | ++    BytesLeft -= VTSize; |
| 136 | ++  } |
| 137 | ++ |
| 138 | ++  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I)); |
| 139 | ++} |
| 140 | ++ |
| 141 | + static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, |
| 142 | + const SelectionDAG &DAG, |
| 143 | + ConstantSDNode *ConstantSize, |
| 144 | +@@ -192,6 +304,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( |
| 145 | + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
| 146 | + Alignment.value(), RTLIB::MEMCPY); |
| 147 | + |
| 148 | ++ if (Subtarget.allowInlineMemcpyAsLdSt()) |
| 149 | ++ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal, |
| 150 | ++ isVolatile, DstPtrInfo, SrcPtrInfo); |
| 151 | ++ |
| 152 | + unsigned BytesLeft = SizeVal & 3; |
| 153 | + unsigned NumMemOps = SizeVal >> 2; |
| 154 | + unsigned EmittedNumMemOps = 0; |
| 155 | +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 156 | +index 275b1c0f8dc0..6ff422c15b12 100644 |
| 157 | +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 158 | ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 159 | +@@ -44,6 +44,12 @@ public: |
| 160 | + MachinePointerInfo DstPtrInfo, |
| 161 | + MachinePointerInfo SrcPtrInfo) const override; |
| 162 | + |
| 163 | ++ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl, |
| 164 | ++ const ARMSubtarget &Subtarget, SDValue Chain, |
| 165 | ++ SDValue Dst, SDValue Src, uint64_t SizeVal, |
| 166 | ++ bool isVolatile, MachinePointerInfo DstPtrInfo, |
| 167 | ++ MachinePointerInfo SrcPtrInfo) const; |
| 168 | ++ |
| 169 | + SDValue |
| 170 | + EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, |
| 171 | + SDValue Dst, SDValue Src, SDValue Size, |
| 172 | +diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h |
| 173 | +index 2f7af05a259f..20aa9e4f334b 100644 |
| 174 | +--- a/llvm/lib/Target/ARM/ARMSubtarget.h |
| 175 | ++++ b/llvm/lib/Target/ARM/ARMSubtarget.h |
| 176 | +@@ -523,6 +523,10 @@ public: |
| 177 | + bool ignoreCSRForAllocationOrder(const MachineFunction &MF, |
| 178 | + unsigned PhysReg) const override; |
| 179 | + unsigned getGPRAllocationOrder(const MachineFunction &MF) const; |
| 180 | ++  /// True only for M-class subtargets with v7 ops; gates EmitMemcpyAsLdSt. |
| 181 | ++  bool allowInlineMemcpyAsLdSt() const { |
| 182 | ++    return ARMProcClass == MClass && HasV7Ops; |
| 183 | ++  } |
| 184 | + }; |
| 185 | + |
| 186 | + } // end namespace llvm |
| 187 | +-- |
| 188 | +2.34.1 |
| 189 | + |
0 commit comments