1- From 40f07cbde57022b25412cf1c9239755613500d86 Mon Sep 17 00:00:00 2001
1+ From a5ba56aadc91cc59bc8b00b77f42594d08fc31c5 Mon Sep 17 00:00:00 2001
22From: nasmnc01 <
[email protected] >
33Author: Scott Douglass <
[email protected] >
44Date: Tue, 13 Aug 2024 10:55:51 +0100
5- Subject: [PATCH] [ARM][CodeGen] Prefer MEMCPY LDM/STM inlining for v7-m
5+ Subject: [PATCH] [ARM][CodeGen]Prefer MEMCPY LDM/STM inlining for v7-m
66
77This patch changes the behaviour of memcpy inlining on v7m targets.
88The old behaviour was to inline memcpys with LDM/STM instructions.
@@ -11,57 +11,19 @@ for performance gains of 1% to 2% on selected benchmarks.
1111
1212Co-authored-by: Nashe Mncube <
[email protected] >
1313---
14- llvm/lib/Target/ARM/ARMFeatures.td | 5 +
15- llvm/lib/Target/ARM/ARMProcessors.td | 2 +-
16- llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++++++++
14+ llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 116 ++++++++++++++
1715 llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 +
18- 4 files changed, 133 insertions(+), 1 deletion(-)
16+ llvm/lib/Target/ARM/ARMSubtarget.h | 4 +
17+ 3 files changed, 126 insertions(+)
1918
20- diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
21- index bb437698296c..f7fa00aba424 100644
22- --- a/llvm/lib/Target/ARM/ARMFeatures.td
23- +++ b/llvm/lib/Target/ARM/ARMFeatures.td
24- @@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
25- "DisablePostRAScheduler", "true",
26- "Don't schedule again after register allocation">;
27-
28- + def FeatureUseInlineMemcpyAsLdSt :
29- + SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt",
30- + "true", "Use memcpy inlining as LD/ST instructions">;
31- +
32- +
33- // Armv8.5-A extensions
34-
35- // Has speculation barrier.
36- diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
37- index b94a5fc16146..ffb0c86bc687 100644
38- --- a/llvm/lib/Target/ARM/ARMProcessors.td
39- +++ b/llvm/lib/Target/ARM/ARMProcessors.td
40- @@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
41- def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
42- "Cortex-M3 ARM processors", []>;
43- def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
44- - "Cortex-M7 ARM processors", []>;
45- + "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>;
46-
47- //===----------------------------------------------------------------------===//
48- // ARM processors
4919diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
50- index c57825949c1c..63ae7a042886 100644
20+ index c57825949c1c..0913b2719813 100644
5121--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
5222+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
53- @@ -12,6 +12,7 @@
54-
55- #include "ARMTargetMachine.h"
56- #include "ARMTargetTransformInfo.h"
57- + #include "llvm/ADT/SmallVector.h"
58- #include "llvm/CodeGen/SelectionDAG.h"
59- #include "llvm/IR/DerivedTypes.h"
60- #include "llvm/Support/CommandLine.h"
61- @@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
23+ @@ -138,6 +138,118 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
6224 return CallResult.second;
6325 }
64-
26+
6527+ SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
6628+ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
6729+ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
@@ -75,8 +37,8 @@ index c57825949c1c..63ae7a042886 100644
7537+ unsigned I = 0;
7638+ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
7739+ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
78- + SmallVector< SDValue> TFOps(6) ;
79- + SmallVector< SDValue> Loads(6) ;
40+ + SDValue TFOps[6] ;
41+ + SDValue Loads[6] ;
8042+ uint64_t SrcOff = 0, DstOff = 0;
8143+
8244+ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
@@ -107,8 +69,7 @@ index c57825949c1c..63ae7a042886 100644
10769+ TFOps[I] = Loads[I].getValue(1);
10870+ SrcOff += VTSize;
10971+ }
110- + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
111- + ArrayRef(TFOps.data(), I));
72+ + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
11273+
11374+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
11475+ TFOps[I] = DAG.getStore(
@@ -118,8 +79,7 @@ index c57825949c1c..63ae7a042886 100644
11879+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
11980+ DstOff += VTSize;
12081+ }
121- + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
122- + ArrayRef(TFOps.data(), I));
82+ + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
12383+
12484+ EmittedNumMemOps += I;
12585+ }
@@ -150,8 +110,7 @@ index c57825949c1c..63ae7a042886 100644
150110+ SrcOff += VTSize;
151111+ BytesLeft -= VTSize;
152112+ }
153- + Chain =
154- + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I));
113+ + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
155114+
156115+ I = 0;
157116+ BytesLeft = BytesLeftSave;
@@ -174,18 +133,17 @@ index c57825949c1c..63ae7a042886 100644
174133+ BytesLeft -= VTSize;
175134+ }
176135+
177- + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
178- + ArrayRef(TFOps.data(), I));
136+ + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
179137+ }
180138+
181139 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
182140 const SelectionDAG &DAG,
183141 ConstantSDNode *ConstantSize,
184- @@ -192,6 +309 ,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
142+ @@ -192,6 +304 ,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
185143 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
186144 Alignment.value(), RTLIB::MEMCPY);
187-
188- + if (Subtarget.useInlineMemcpyAsLdSt ())
145+
146+ + if (Subtarget.allowInlineMemcpyAsLdSt ())
189147+ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
190148+ isVolatile, DstPtrInfo, SrcPtrInfo);
191149+
@@ -199,7 +157,7 @@ index 275b1c0f8dc0..6ff422c15b12 100644
199157@@ -44,6 +44,12 @@ public:
200158 MachinePointerInfo DstPtrInfo,
201159 MachinePointerInfo SrcPtrInfo) const override;
202-
160+
203161+ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
204162+ const ARMSubtarget &Subtarget, SDValue Chain,
205163+ SDValue Dst, SDValue Src, uint64_t SizeVal,
@@ -209,6 +167,21 @@ index 275b1c0f8dc0..6ff422c15b12 100644
209167 SDValue
210168 EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
211169 SDValue Dst, SDValue Src, SDValue Size,
212- - -
170+ diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
171+ index 2f7af05a259f..20aa9e4f334b 100644
172+ --- a/llvm/lib/Target/ARM/ARMSubtarget.h
173+ +++ b/llvm/lib/Target/ARM/ARMSubtarget.h
174+ @@ -523,6 +523,10 @@ public:
175+ bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
176+ unsigned PhysReg) const override;
177+ unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
178+ +
179+ + bool allowInlineMemcpyAsLdSt() const {
180+ + return HasV7Ops && ARMProcClass == MClass;
181+ + }
182+ };
183+
184+ } // end namespace llvm
185+ - -
2131862.34.1
214187
0 commit comments