
Commit cd41a86

Patch file for alternative MEMCPY LDM/STM inlining for Cortex-M7 (#548)
Performance improvements of around 1% to 2% have been seen on selected benchmarks when LD/ST inlining is preferred over LDM/STM inlining for Cortex-M7. This commit adds a patch file that enables that optimisation.
1 parent 11f7cf9 commit cd41a86
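The patch below adds a use-inline-memcpy-ldst subtarget feature and attaches it to the Cortex-M7 processor definition, so -mcpu=cortex-m7 enables it by default and -mattr can toggle it on other cores. The following is a rough sketch of how one might exercise the new lowering once the patch is applied; the input and RUN lines are hypothetical, not the memcpy-v7m.ll test listed in the diffstat, and the copy has to be large and aligned enough to reach the ARM memcpy hook rather than the generic lowering.

; Hypothetical example -- not the memcpy-v7m.ll test shipped with the patch.
; Assumes the patch is applied so that cortex-m7 implies use-inline-memcpy-ldst.
; RUN: llc -mtriple=thumbv7em-none-eabi -mcpu=cortex-m7 %s -o -
; RUN: llc -mtriple=thumbv7m-none-eabi -mattr=+use-inline-memcpy-ldst %s -o -

define void @copy32(ptr %dst, ptr %src) {
entry:
  ; 32 bytes with word alignment: eight 4-byte load/store pairs.
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dst, ptr align 4 %src, i32 32, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)

With the feature enabled, EmitMemcpyAsLdSt in the patch emits such a copy as batches of up to six 4-byte loads followed by the matching stores; as the comment in the patch notes, later passes may still merge these into LDM/STM.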

File tree: 1 file changed, 217 additions and 0 deletions
@@ -0,0 +1,217 @@
From 61af6af10d10a08b81d3924fa5b35bfb548b2a05 Mon Sep 17 00:00:00 2001
From: nasmnc01 <[email protected]>
Author: Scott Douglass <[email protected]>
Date: Tue, 13 Aug 2024 10:55:51 +0100
Subject: [PATCH] [ARM][CodeGen] Prefer MEMCPY LDM/STM inlining for v7-m

This patch changes the behaviour of memcpy inlining on v7m targets.
The old behaviour was to inline memcpys with LDM/STM instructions.
Alternatively, using LD/ST instructions for memcpy inlining allowed
for performance gains of 1% to 2% on selected benchmarks.

Co-authored-by: Nashe Mncube <[email protected]>
---
 llvm/lib/Target/ARM/ARMFeatures.td          |   5 +
 llvm/lib/Target/ARM/ARMProcessors.td        |   2 +-
 llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++
 llvm/lib/Target/ARM/ARMSelectionDAGInfo.h   |   6 +
 llvm/lib/Target/ARM/ARMSubtarget.h          |   2 +
 llvm/test/CodeGen/ARM/memcpy-v7m.ll         | 165 ++++++++++++++++++++
 6 files changed, 300 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll

diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index bb437698296c..f7fa00aba424 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
     "DisablePostRAScheduler", "true",
     "Don't schedule again after register allocation">;
 
+def FeatureUseInlineMemcpyAsLdSt :
+    SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt",
+                     "true", "Use memcpy inlining as LD/ST instructions">;
+
+
 // Armv8.5-A extensions
 
 // Has speculation barrier.
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index b94a5fc16146..ffb0c86bc687 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
 def ProcM3      : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
                                    "Cortex-M3 ARM processors", []>;
 def ProcM7      : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
-                                   "Cortex-M7 ARM processors", []>;
+                                   "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>;
 
 //===----------------------------------------------------------------------===//
 // ARM processors
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index c57825949c1c..12db2ab1fca2 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "ARMTargetMachine.h"
 #include "ARMTargetTransformInfo.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/Support/CommandLine.h"
@@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
   return CallResult.second;
 }
 
+SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
+    SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
+    SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Do repeated batches of 4-byte loads and stores.
+  unsigned BytesLeft = SizeVal & 3;
+  unsigned NumMemOps = SizeVal >> 2;
+  unsigned EmittedNumMemOps = 0;
+  EVT VT = MVT::i32;
+  unsigned VTSize = 4;
+  unsigned I = 0;
+  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
+  const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
+  SmallVector<SDValue> TFOps(6);
+  SmallVector<SDValue> Loads(6);
+  uint64_t SrcOff = 0, DstOff = 0;
+
+  MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
+  if (isVolatile)
+    MOFlags = MachineMemOperand::Flags::MOVolatile;
+  MachineMemOperand::Flags LoadMOFlags = MOFlags;
+  if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
+                                   DAG.getDataLayout()))
+    LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable;
+  if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>())
+    if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant())
+      LoadMOFlags |= MachineMemOperand::Flags::MOInvariant;
+  MachineMemOperand::Flags StoreMOFlags = MOFlags;
+  if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
+                                   DAG.getDataLayout()))
+    StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable;
+
+  // Emit up to MaxLoads loads, then a TokenFactor barrier, then the
+  // same number of stores. The loads and stores may get combined into
+  // ldm/stm later on.
+  while (EmittedNumMemOps < NumMemOps) {
+    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
+      Loads[I] = DAG.getLoad(VT, dl, Chain,
+                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                         DAG.getConstant(SrcOff, dl, MVT::i32)),
+                             SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
+                             LoadMOFlags);
+      TFOps[I] = Loads[I].getValue(1);
+      SrcOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        ArrayRef(TFOps.data(), I));
+
+    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
+      TFOps[I] = DAG.getStore(
+          Chain, dl, Loads[I],
+          DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                      DAG.getConstant(DstOff, dl, MVT::i32)),
+          DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
+      DstOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        ArrayRef(TFOps.data(), I));
+
+    EmittedNumMemOps += I;
+  }
+
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes.
+  unsigned BytesLeftSave = BytesLeft;
+  I = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[I] = DAG.getLoad(VT, dl, Chain,
+                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
+                           SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
+                           LoadMOFlags);
+
+    TFOps[I] = Loads[I].getValue(1);
+    ++I;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain =
+      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I));
+
+  I = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[I] = DAG.getStore(Chain, dl, Loads[I],
+                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                        DAG.getConstant(DstOff, dl, MVT::i32)),
+                            DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
+                            StoreMOFlags);
+    ++I;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     ArrayRef(TFOps.data(), I));
+}
+
 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
                                        const SelectionDAG &DAG,
                                        ConstantSDNode *ConstantSize,
@@ -192,6 +309,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   Alignment.value(), RTLIB::MEMCPY);
 
+  if (Subtarget.UseInlineMemcpyAsLdSt)
+    return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
+                            isVolatile, DstPtrInfo, SrcPtrInfo);
+
   unsigned BytesLeft = SizeVal & 3;
   unsigned NumMemOps = SizeVal >> 2;
   unsigned EmittedNumMemOps = 0;
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index 275b1c0f8dc0..6ff422c15b12 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -44,6 +44,12 @@ public:
                            MachinePointerInfo DstPtrInfo,
                            MachinePointerInfo SrcPtrInfo) const override;
 
+  SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
+                           const ARMSubtarget &Subtarget, SDValue Chain,
+                           SDValue Dst, SDValue Src, uint64_t SizeVal,
+                           bool isVolatile, MachinePointerInfo DstPtrInfo,
+                           MachinePointerInfo SrcPtrInfo) const;
+
   SDValue
   EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
                            SDValue Dst, SDValue Src, SDValue Size,
--
2.34.1

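EmitMemcpyAsLdSt copies SizeVal >> 2 words in batches of up to MaxLoads and then covers the remaining SizeVal & 3 bytes with halfword and byte accesses, so a 31-byte copy works out to seven word operations plus one i16 and one i8 pair. Below is a hypothetical IR input of that shape, under the same assumptions as the sketch near the top of the page; it is not the memcpy-v7m.ll test from the diffstat, whose contents are not included in the patch text above.

; Hypothetical example exercising the trailing-bytes path of EmitMemcpyAsLdSt.
; RUN: llc -mtriple=thumbv7em-none-eabi -mcpu=cortex-m7 %s -o -

define void @copy31(ptr %dst, ptr %src) {
entry:
  ; 31 = 7 * 4 + 2 + 1: seven word copies, then one halfword and one byte.
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dst, ptr align 4 %src, i32 31, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)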