Skip to content

Commit 3c24c8d

Browse files
committed
[Perf] Update inlining memcpy patch
The patch file for setting the memcpy inlining preference was slightly incorrect. Use of the SmallVector data structure caused issues with correct inlining on some tests. The optimisation was also not enabled for v8.1-M Mainline targets, as it should be. I've updated the patch file to reflect this. Change-Id: Ib7e406f7b08d5928eb8ed8d6d98eb1844fef7fd2
1 parent 77002e7 commit 3c24c8d

File tree

1 file changed

+189
-0
lines changed

1 file changed

+189
-0
lines changed
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
From a5ba56aadc91cc59bc8b00b77f42594d08fc31c5 Mon Sep 17 00:00:00 2001
2+
From: nasmnc01 <[email protected]>
3+
Author: Scott Douglass <[email protected]>
4+
Date: Tue, 13 Aug 2024 10:55:51 +0100
5+
Subject: [PATCH] [ARM][CodeGen]Prefer MEMCPY LDM/STM inlining for v7-m
6+
7+
This patch changes the behaviour of memcpy inlining on v7m targets.
8+
The old behaviour was to inline memcpys with LDM/STM instructions.
9+
Alternatively, using LD/ST instructions for memcpy inlining allowed
10+
for performance gains of 1% to 2% on selected benchmarks.
11+
12+
Co-authored-by: Nashe Mncube <[email protected]>
13+
---
14+
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 116 ++++++++++++++
15+
llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 +
16+
llvm/lib/Target/ARM/ARMSubtarget.h | 4 +
17+
llvm/test/CodeGen/ARM/memcpy-v7m.ll | 165 ++++++++++++++++++++
18+
4 files changed, 291 insertions(+)
19+
create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll
20+
21+
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
22+
index c57825949c1c..0913b2719813 100644
23+
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
24+
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
25+
@@ -138,6 +138,118 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
26+
return CallResult.second;
27+
}
28+
29+
+SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
30+
+ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
31+
+ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
32+
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
33+
+ // Copy SizeVal bytes from Src to Dst inline: batches of 4-byte loads and stores, then a 1-3 byte tail.
34+
+ unsigned BytesLeft = SizeVal & 3;
35+
+ unsigned NumMemOps = SizeVal >> 2;
36+
+ unsigned EmittedNumMemOps = 0;
37+
+ EVT VT = MVT::i32;
38+
+ unsigned VTSize = 4;
39+
+ unsigned I = 0;
40+
+ // Emit a maximum of 4 loads per batch in Thumb1 since we have fewer registers; otherwise 6.
41+
+ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
42+
+ SDValue TFOps[6];
43+
+ SDValue Loads[6];
44+
+ uint64_t SrcOff = 0, DstOff = 0;
45+
+
46+
+ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
47+
+ if (isVolatile)
48+
+ MOFlags = MachineMemOperand::Flags::MOVolatile;
49+
+ MachineMemOperand::Flags LoadMOFlags = MOFlags;
50+
+ if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
51+
+ DAG.getDataLayout()))
52+
+ LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable;
53+
+ if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>())
54+
+ if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant())
55+
+ LoadMOFlags |= MachineMemOperand::Flags::MOInvariant;
56+
+ MachineMemOperand::Flags StoreMOFlags = MOFlags;
57+
+ if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
58+
+ DAG.getDataLayout()))
59+
+ StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable;
60+
+
61+
+ // Each batch: up to MaxLoads loads joined by a TokenFactor, then the
62+
+ // same number of stores chained after it and joined by a second
63+
+ // TokenFactor. The loads and stores may get combined into ldm/stm later.
64+
+ while (EmittedNumMemOps < NumMemOps) {
65+
+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
66+
+ Loads[I] = DAG.getLoad(VT, dl, Chain,
67+
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
68+
+ DAG.getConstant(SrcOff, dl, MVT::i32)),
69+
+ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
70+
+ LoadMOFlags);
71+
+ TFOps[I] = Loads[I].getValue(1);
72+
+ SrcOff += VTSize;
73+
+ }
74+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
75+
+
76+
+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
77+
+ TFOps[I] = DAG.getStore(
78+
+ Chain, dl, Loads[I],
79+
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
80+
+ DAG.getConstant(DstOff, dl, MVT::i32)),
81+
+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
82+
+ DstOff += VTSize;
83+
+ }
84+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
85+
+
86+
+ EmittedNumMemOps += I;
87+
+ }
88+
+
89+
+ if (BytesLeft == 0)
90+
+ return Chain;
91+
+
92+
+ // Copy the trailing 1-3 bytes: an i16 while at least 2 bytes remain, then an i8.
93+
+ unsigned BytesLeftSave = BytesLeft;
94+
+ I = 0;
95+
+ while (BytesLeft) {
96+
+ if (BytesLeft >= 2) {
97+
+ VT = MVT::i16;
98+
+ VTSize = 2;
99+
+ } else {
100+
+ VT = MVT::i8;
101+
+ VTSize = 1;
102+
+ }
103+
+
104+
+ Loads[I] = DAG.getLoad(VT, dl, Chain,
105+
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
106+
+ DAG.getConstant(SrcOff, dl, MVT::i32)),
107+
+ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
108+
+ LoadMOFlags);
109+
+
110+
+ TFOps[I] = Loads[I].getValue(1);
111+
+ ++I;
112+
+ SrcOff += VTSize;
113+
+ BytesLeft -= VTSize;
114+
+ }
115+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
116+
+
117+
+ I = 0;
118+
+ BytesLeft = BytesLeftSave;
119+
+ while (BytesLeft) {
120+
+ if (BytesLeft >= 2) {
121+
+ VT = MVT::i16;
122+
+ VTSize = 2;
123+
+ } else {
124+
+ VT = MVT::i8;
125+
+ VTSize = 1;
126+
+ }
127+
+
128+
+ TFOps[I] = DAG.getStore(Chain, dl, Loads[I],
129+
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
130+
+ DAG.getConstant(DstOff, dl, MVT::i32)),
131+
+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
132+
+ StoreMOFlags);
133+
+ ++I;
134+
+ DstOff += VTSize;
135+
+ BytesLeft -= VTSize;
136+
+ }
137+
+
138+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
139+
+}
140+
+
141+
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
142+
const SelectionDAG &DAG,
143+
ConstantSDNode *ConstantSize,
144+
@@ -192,6 +304,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
145+
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
146+
Alignment.value(), RTLIB::MEMCPY);
147+
148+
+ if (Subtarget.allowInlineMemcpyAsLdSt())
149+
+ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
150+
+ isVolatile, DstPtrInfo, SrcPtrInfo);
151+
+
152+
unsigned BytesLeft = SizeVal & 3;
153+
unsigned NumMemOps = SizeVal >> 2;
154+
unsigned EmittedNumMemOps = 0;
155+
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
156+
index 275b1c0f8dc0..6ff422c15b12 100644
157+
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
158+
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
159+
@@ -44,6 +44,12 @@ public:
160+
MachinePointerInfo DstPtrInfo,
161+
MachinePointerInfo SrcPtrInfo) const override;
162+
163+
+ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
164+
+ const ARMSubtarget &Subtarget, SDValue Chain,
165+
+ SDValue Dst, SDValue Src, uint64_t SizeVal,
166+
+ bool isVolatile, MachinePointerInfo DstPtrInfo,
167+
+ MachinePointerInfo SrcPtrInfo) const;
168+
+
169+
SDValue
170+
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
171+
SDValue Dst, SDValue Src, SDValue Size,
172+
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
173+
index 2f7af05a259f..20aa9e4f334b 100644
174+
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
175+
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
176+
@@ -523,6 +523,10 @@ public:
177+
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
178+
unsigned PhysReg) const override;
179+
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
180+
+
181+
+ bool allowInlineMemcpyAsLdSt() const {
182+
+ return HasV7Ops && ARMProcClass == MClass; // gate checked by EmitTargetCodeForMemcpy before calling EmitMemcpyAsLdSt
183+
+ }
184+
};
185+
186+
} // end namespace llvm
187+
--
188+
2.34.1
189+

0 commit comments

Comments
 (0)