Skip to content

Commit 81f982e

Browse files
committed
[Perf] Update inlining memcpy patch
The patch file for setting the memcpy inlining preference was slightly incorrect. Use of the SmallVector data structure caused issues with correct inlining on some tests. The optimisation was also not enabled for v8.1m mainline targets as it should have been. I've updated the patch file to reflect this. Change-Id: Ib7e406f7b08d5928eb8ed8d6d98eb1844fef7fd2
1 parent 77002e7 commit 81f982e

File tree

1 file changed

+34
-61
lines changed

1 file changed

+34
-61
lines changed
Lines changed: 34 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
From 40f07cbde57022b25412cf1c9239755613500d86 Mon Sep 17 00:00:00 2001
1+
From a5ba56aadc91cc59bc8b00b77f42594d08fc31c5 Mon Sep 17 00:00:00 2001
22
From: nasmnc01 <[email protected]>
33
Author: Scott Douglass <[email protected]>
44
Date: Tue, 13 Aug 2024 10:55:51 +0100
5-
Subject: [PATCH] [ARM][CodeGen] Prefer MEMCPY LDM/STM inlining for v7-m
5+
Subject: [PATCH] [ARM][CodeGen]Prefer MEMCPY LDM/STM inlining for v7-m
66

77
This patch changes the behaviour of memcpy inlining on v7m targets.
88
The old behaviour was to inline memcpys with LDM/STM instructions.
@@ -11,57 +11,19 @@ for performance gains of 1% to 2% on selected benchmarks.
1111

1212
Co-authored-by: Nashe Mncube <[email protected]>
1313
---
14-
llvm/lib/Target/ARM/ARMFeatures.td | 5 +
15-
llvm/lib/Target/ARM/ARMProcessors.td | 2 +-
16-
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++++++++
14+
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 116 ++++++++++++++
1715
llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 +
18-
4 files changed, 133 insertions(+), 1 deletion(-)
16+
llvm/lib/Target/ARM/ARMSubtarget.h | 4 +
17+
3 files changed, 126 insertions(+)
1918

20-
diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
21-
index bb437698296c..f7fa00aba424 100644
22-
--- a/llvm/lib/Target/ARM/ARMFeatures.td
23-
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
24-
@@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
25-
"DisablePostRAScheduler", "true",
26-
"Don't schedule again after register allocation">;
27-
28-
+def FeatureUseInlineMemcpyAsLdSt :
29-
+ SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt",
30-
+ "true", "Use memcpy inlining as LD/ST instructions">;
31-
+
32-
+
33-
// Armv8.5-A extensions
34-
35-
// Has speculation barrier.
36-
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
37-
index b94a5fc16146..ffb0c86bc687 100644
38-
--- a/llvm/lib/Target/ARM/ARMProcessors.td
39-
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
40-
@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
41-
def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
42-
"Cortex-M3 ARM processors", []>;
43-
def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
44-
- "Cortex-M7 ARM processors", []>;
45-
+ "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>;
46-
47-
//===----------------------------------------------------------------------===//
48-
// ARM processors
4919
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
50-
index c57825949c1c..63ae7a042886 100644
20+
index c57825949c1c..0913b2719813 100644
5121
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
5222
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
53-
@@ -12,6 +12,7 @@
54-
55-
#include "ARMTargetMachine.h"
56-
#include "ARMTargetTransformInfo.h"
57-
+#include "llvm/ADT/SmallVector.h"
58-
#include "llvm/CodeGen/SelectionDAG.h"
59-
#include "llvm/IR/DerivedTypes.h"
60-
#include "llvm/Support/CommandLine.h"
61-
@@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
23+
@@ -138,6 +138,118 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
6224
return CallResult.second;
6325
}
64-
26+
6527
+SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
6628
+ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
6729
+ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
@@ -75,8 +37,8 @@ index c57825949c1c..63ae7a042886 100644
7537
+ unsigned I = 0;
7638
+ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
7739
+ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
78-
+ SmallVector<SDValue> TFOps(6);
79-
+ SmallVector<SDValue> Loads(6);
40+
+ SDValue TFOps[6];
41+
+ SDValue Loads[6];
8042
+ uint64_t SrcOff = 0, DstOff = 0;
8143
+
8244
+ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
@@ -107,8 +69,7 @@ index c57825949c1c..63ae7a042886 100644
10769
+ TFOps[I] = Loads[I].getValue(1);
10870
+ SrcOff += VTSize;
10971
+ }
110-
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
111-
+ ArrayRef(TFOps.data(), I));
72+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
11273
+
11374
+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
11475
+ TFOps[I] = DAG.getStore(
@@ -118,8 +79,7 @@ index c57825949c1c..63ae7a042886 100644
11879
+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
11980
+ DstOff += VTSize;
12081
+ }
121-
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
122-
+ ArrayRef(TFOps.data(), I));
82+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
12383
+
12484
+ EmittedNumMemOps += I;
12585
+ }
@@ -150,8 +110,7 @@ index c57825949c1c..63ae7a042886 100644
150110
+ SrcOff += VTSize;
151111
+ BytesLeft -= VTSize;
152112
+ }
153-
+ Chain =
154-
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I));
113+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
155114
+
156115
+ I = 0;
157116
+ BytesLeft = BytesLeftSave;
@@ -174,18 +133,17 @@ index c57825949c1c..63ae7a042886 100644
174133
+ BytesLeft -= VTSize;
175134
+ }
176135
+
177-
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
178-
+ ArrayRef(TFOps.data(), I));
136+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
179137
+}
180138
+
181139
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
182140
const SelectionDAG &DAG,
183141
ConstantSDNode *ConstantSize,
184-
@@ -192,6 +309,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
142+
@@ -192,6 +304,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
185143
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
186144
Alignment.value(), RTLIB::MEMCPY);
187-
188-
+ if (Subtarget.useInlineMemcpyAsLdSt())
145+
146+
+ if (Subtarget.allowInlineMemcpyAsLdSt())
189147
+ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
190148
+ isVolatile, DstPtrInfo, SrcPtrInfo);
191149
+
@@ -199,7 +157,7 @@ index 275b1c0f8dc0..6ff422c15b12 100644
199157
@@ -44,6 +44,12 @@ public:
200158
MachinePointerInfo DstPtrInfo,
201159
MachinePointerInfo SrcPtrInfo) const override;
202-
160+
203161
+ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
204162
+ const ARMSubtarget &Subtarget, SDValue Chain,
205163
+ SDValue Dst, SDValue Src, uint64_t SizeVal,
@@ -209,6 +167,21 @@ index 275b1c0f8dc0..6ff422c15b12 100644
209167
SDValue
210168
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
211169
SDValue Dst, SDValue Src, SDValue Size,
212-
--
170+
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
171+
index 2f7af05a259f..20aa9e4f334b 100644
172+
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
173+
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
174+
@@ -523,6 +523,10 @@ public:
175+
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
176+
unsigned PhysReg) const override;
177+
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
178+
+
179+
+ bool allowInlineMemcpyAsLdSt() const {
180+
+ return HasV7Ops && ARMProcClass == MClass;
181+
+ }
182+
};
183+
184+
} // end namespace llvm
185+
--
213186
2.34.1
214187

0 commit comments

Comments
 (0)