|
| 1 | +From 61af6af10d10a08b81d3924fa5b35bfb548b2a05 Mon Sep 17 00:00:00 2001 |
| 2 | +From: nasmnc01 < [email protected]> |
| 3 | +Author: Scott Douglass < [email protected]> |
| 4 | +Date: Tue, 13 Aug 2024 10:55:51 +0100 |
| 5 | +Subject: [PATCH] [ARM][CodeGen] Prefer MEMCPY LD/ST inlining over LDM/STM for v7-m |
| 6 | + |
| 7 | +This patch changes the behaviour of memcpy inlining on v7m targets. |
| 8 | +The old behaviour was to inline memcpys with LDM/STM instructions. |
| 9 | +The new behaviour uses LD/ST instructions for memcpy inlining instead, |
| 10 | +which gave performance gains of 1% to 2% on selected benchmarks. |
| 11 | + |
| 12 | +Co-authored-by: Nashe Mncube < [email protected]> |
| 13 | +--- |
| 14 | + llvm/lib/Target/ARM/ARMFeatures.td | 5 + |
| 15 | + llvm/lib/Target/ARM/ARMProcessors.td | 2 +- |
| 16 | + llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++ |
| 17 | + llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 + |
| 18 | + llvm/lib/Target/ARM/ARMSubtarget.h | 2 + |
| 19 | + llvm/test/CodeGen/ARM/memcpy-v7m.ll | 165 ++++++++++++++++++++ |
| 20 | + 6 files changed, 300 insertions(+), 1 deletion(-) |
| 21 | + create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll |
| 22 | + |
| 23 | +diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td |
| 24 | +index bb437698296c..f7fa00aba424 100644 |
| 25 | +--- a/llvm/lib/Target/ARM/ARMFeatures.td |
| 26 | ++++ b/llvm/lib/Target/ARM/ARMFeatures.td |
| 27 | +@@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler", |
| 28 | + "DisablePostRAScheduler", "true", |
| 29 | + "Don't schedule again after register allocation">; |
| 30 | + |
| 31 | ++def FeatureUseInlineMemcpyAsLdSt : |
| 32 | ++ SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt", |
| 33 | ++ "true", "Use memcpy inlining as LD/ST instructions">; |
| 34 | ++ |
| 35 | ++ |
| 36 | + // Armv8.5-A extensions |
| 37 | + |
| 38 | + // Has speculation barrier. |
| 39 | +diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td |
| 40 | +index b94a5fc16146..ffb0c86bc687 100644 |
| 41 | +--- a/llvm/lib/Target/ARM/ARMProcessors.td |
| 42 | ++++ b/llvm/lib/Target/ARM/ARMProcessors.td |
| 43 | +@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus", |
| 44 | + def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", |
| 45 | + "Cortex-M3 ARM processors", []>; |
| 46 | + def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7", |
| 47 | +- "Cortex-M7 ARM processors", []>; |
| 48 | ++ "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>; |
| 49 | + |
| 50 | + //===----------------------------------------------------------------------===// |
| 51 | + // ARM processors |
| 52 | +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 53 | +index c57825949c1c..12db2ab1fca2 100644 |
| 54 | +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 55 | ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 56 | +@@ -12,6 +12,7 @@ |
| 57 | + |
| 58 | + #include "ARMTargetMachine.h" |
| 59 | + #include "ARMTargetTransformInfo.h" |
| 60 | ++#include "llvm/ADT/SmallVector.h" |
| 61 | + #include "llvm/CodeGen/SelectionDAG.h" |
| 62 | + #include "llvm/IR/DerivedTypes.h" |
| 63 | + #include "llvm/Support/CommandLine.h" |
| 64 | +@@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( |
| 65 | + return CallResult.second; |
| 66 | + } |
| 67 | + |
| 68 | ++SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt( |
| 69 | ++ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain, |
| 70 | ++ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile, |
| 71 | ++ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { |
| 72 | ++ // Do repeated batches of 4-byte loads and stores. |
| 73 | ++ unsigned BytesLeft = SizeVal & 3; |
| 74 | ++ unsigned NumMemOps = SizeVal >> 2; |
| 75 | ++ unsigned EmittedNumMemOps = 0; |
| 76 | ++ EVT VT = MVT::i32; |
| 77 | ++ unsigned VTSize = 4; |
| 78 | ++ unsigned I = 0; |
| 79 | ++ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers |
| 80 | ++ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6; |
| 81 | ++ SmallVector<SDValue> TFOps(6); |
| 82 | ++ SmallVector<SDValue> Loads(6); |
| 83 | ++ uint64_t SrcOff = 0, DstOff = 0; |
| 84 | ++ |
| 85 | ++ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone; |
| 86 | ++ if (isVolatile) |
| 87 | ++ MOFlags = MachineMemOperand::Flags::MOVolatile; |
| 88 | ++ MachineMemOperand::Flags LoadMOFlags = MOFlags; |
| 89 | ++ if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), |
| 90 | ++ DAG.getDataLayout())) |
| 91 | ++ LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable; |
| 92 | ++ if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>()) |
| 93 | ++ if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant()) |
| 94 | ++ LoadMOFlags |= MachineMemOperand::Flags::MOInvariant; |
| 95 | ++ MachineMemOperand::Flags StoreMOFlags = MOFlags; |
| 96 | ++ if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), |
| 97 | ++ DAG.getDataLayout())) |
| 98 | ++ StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable; |
| 99 | ++ |
| 100 | ++ // Emit up to MaxLoads loads, then a TokenFactor barrier, then the |
| 101 | ++ // same number of stores. The loads and stores may get combined into |
| 102 | ++ // ldm/stm later on. |
| 103 | ++ while (EmittedNumMemOps < NumMemOps) { |
| 104 | ++ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { |
| 105 | ++ Loads[I] = DAG.getLoad(VT, dl, Chain, |
| 106 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Src, |
| 107 | ++ DAG.getConstant(SrcOff, dl, MVT::i32)), |
| 108 | ++ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), |
| 109 | ++ LoadMOFlags); |
| 110 | ++ TFOps[I] = Loads[I].getValue(1); |
| 111 | ++ SrcOff += VTSize; |
| 112 | ++ } |
| 113 | ++ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, |
| 114 | ++ ArrayRef(TFOps.data(), I)); |
| 115 | ++ |
| 116 | ++ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { |
| 117 | ++ TFOps[I] = DAG.getStore( |
| 118 | ++ Chain, dl, Loads[I], |
| 119 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, |
| 120 | ++ DAG.getConstant(DstOff, dl, MVT::i32)), |
| 121 | ++ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags); |
| 122 | ++ DstOff += VTSize; |
| 123 | ++ } |
| 124 | ++ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, |
| 125 | ++ ArrayRef(TFOps.data(), I)); |
| 126 | ++ |
| 127 | ++ EmittedNumMemOps += I; |
| 128 | ++ } |
| 129 | ++ |
| 130 | ++ if (BytesLeft == 0) |
| 131 | ++ return Chain; |
| 132 | ++ |
| 133 | ++ // Issue loads / stores for the trailing (1 - 3) bytes. |
| 134 | ++ unsigned BytesLeftSave = BytesLeft; |
| 135 | ++ I = 0; |
| 136 | ++ while (BytesLeft) { |
| 137 | ++ if (BytesLeft >= 2) { |
| 138 | ++ VT = MVT::i16; |
| 139 | ++ VTSize = 2; |
| 140 | ++ } else { |
| 141 | ++ VT = MVT::i8; |
| 142 | ++ VTSize = 1; |
| 143 | ++ } |
| 144 | ++ |
| 145 | ++ Loads[I] = DAG.getLoad(VT, dl, Chain, |
| 146 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Src, |
| 147 | ++ DAG.getConstant(SrcOff, dl, MVT::i32)), |
| 148 | ++ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), |
| 149 | ++ LoadMOFlags); |
| 150 | ++ |
| 151 | ++ TFOps[I] = Loads[I].getValue(1); |
| 152 | ++ ++I; |
| 153 | ++ SrcOff += VTSize; |
| 154 | ++ BytesLeft -= VTSize; |
| 155 | ++ } |
| 156 | ++ Chain = |
| 157 | ++ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I)); |
| 158 | ++ |
| 159 | ++ I = 0; |
| 160 | ++ BytesLeft = BytesLeftSave; |
| 161 | ++ while (BytesLeft) { |
| 162 | ++ if (BytesLeft >= 2) { |
| 163 | ++ VT = MVT::i16; |
| 164 | ++ VTSize = 2; |
| 165 | ++ } else { |
| 166 | ++ VT = MVT::i8; |
| 167 | ++ VTSize = 1; |
| 168 | ++ } |
| 169 | ++ |
| 170 | ++ TFOps[I] = DAG.getStore(Chain, dl, Loads[I], |
| 171 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, |
| 172 | ++ DAG.getConstant(DstOff, dl, MVT::i32)), |
| 173 | ++ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), |
| 174 | ++ StoreMOFlags); |
| 175 | ++ ++I; |
| 176 | ++ DstOff += VTSize; |
| 177 | ++ BytesLeft -= VTSize; |
| 178 | ++ } |
| 179 | ++ |
| 180 | ++ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, |
| 181 | ++ ArrayRef(TFOps.data(), I)); |
| 182 | ++} |
| 183 | ++ |
| 184 | + static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, |
| 185 | + const SelectionDAG &DAG, |
| 186 | + ConstantSDNode *ConstantSize, |
| 187 | +@@ -192,6 +309,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( |
| 188 | + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
| 189 | + Alignment.value(), RTLIB::MEMCPY); |
| 190 | + |
| 191 | ++ if (Subtarget.UseInlineMemcpyAsLdSt) |
| 192 | ++ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal, |
| 193 | ++ isVolatile, DstPtrInfo, SrcPtrInfo); |
| 194 | ++ |
| 195 | + unsigned BytesLeft = SizeVal & 3; |
| 196 | + unsigned NumMemOps = SizeVal >> 2; |
| 197 | + unsigned EmittedNumMemOps = 0; |
| 198 | +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 199 | +index 275b1c0f8dc0..6ff422c15b12 100644 |
| 200 | +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 201 | ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 202 | +@@ -44,6 +44,12 @@ public: |
| 203 | + MachinePointerInfo DstPtrInfo, |
| 204 | + MachinePointerInfo SrcPtrInfo) const override; |
| 205 | + |
| 206 | ++ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl, |
| 207 | ++ const ARMSubtarget &Subtarget, SDValue Chain, |
| 208 | ++ SDValue Dst, SDValue Src, uint64_t SizeVal, |
| 209 | ++ bool isVolatile, MachinePointerInfo DstPtrInfo, |
| 210 | ++ MachinePointerInfo SrcPtrInfo) const; |
| 211 | ++ |
| 212 | + SDValue |
| 213 | + EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, |
| 214 | + SDValue Dst, SDValue Src, SDValue Size, |
| 215 | +-- |
| 216 | +2.34.1 |
| 217 | + |
0 commit comments