diff --git a/patches/llvm-project-perf/0002-ARM-CodeGen-Disable-MEMCPY-LDM-STM-inlining-for-v7-m.patch b/patches/llvm-project-perf/0002-ARM-CodeGen-Disable-MEMCPY-LDM-STM-inlining-for-v7-m.patch new file mode 100644 index 00000000..38d3e2d8 --- /dev/null +++ b/patches/llvm-project-perf/0002-ARM-CodeGen-Disable-MEMCPY-LDM-STM-inlining-for-v7-m.patch @@ -0,0 +1,217 @@ +From 61af6af10d10a08b81d3924fa5b35bfb548b2a05 Mon Sep 17 00:00:00 2001 +From: nasmnc01 +Author: Scott Douglass +Date: Tue, 13 Aug 2024 10:55:51 +0100 +Subject: [PATCH] [ARM][CodeGen] Prefer MEMCPY LDM/STM inlining for v7-m + +This patch changes the behaviour of memcpy inlining on v7m targets. +The old behaviour was to inline memcpys with LDM/STM instructions. +Alternatively, using LD/ST instructions for memcpy inlining allowed +for performance gains of 1% to 2% on selected benchmarks. + +Co-authored-by: Nashe Mncube +--- + llvm/lib/Target/ARM/ARMFeatures.td | 5 + + llvm/lib/Target/ARM/ARMProcessors.td | 2 +- + llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++ + llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 + + llvm/lib/Target/ARM/ARMSubtarget.h | 2 + + llvm/test/CodeGen/ARM/memcpy-v7m.ll | 165 ++++++++++++++++++++ + 6 files changed, 300 insertions(+), 1 deletion(-) + create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll + +diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td +index bb437698296c..f7fa00aba424 100644 +--- a/llvm/lib/Target/ARM/ARMFeatures.td ++++ b/llvm/lib/Target/ARM/ARMFeatures.td +@@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler", + "DisablePostRAScheduler", "true", + "Don't schedule again after register allocation">; + ++def FeatureUseInlineMemcpyAsLdSt : ++ SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt", ++ "true", "Use memcpy inlining as LD/ST instructions">; ++ ++ + // Armv8.5-A extensions + + // Has speculation barrier. +diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td +index b94a5fc16146..ffb0c86bc687 100644 +--- a/llvm/lib/Target/ARM/ARMProcessors.td ++++ b/llvm/lib/Target/ARM/ARMProcessors.td +@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus", + def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", + "Cortex-M3 ARM processors", []>; + def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7", +- "Cortex-M7 ARM processors", []>; ++ "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>; + + //===----------------------------------------------------------------------===// + // ARM processors +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +index c57825949c1c..12db2ab1fca2 100644 +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +@@ -12,6 +12,7 @@ + + #include "ARMTargetMachine.h" + #include "ARMTargetTransformInfo.h" ++#include "llvm/ADT/SmallVector.h" + #include "llvm/CodeGen/SelectionDAG.h" + #include "llvm/IR/DerivedTypes.h" + #include "llvm/Support/CommandLine.h" +@@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( + return CallResult.second; + } + ++SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt( ++ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain, ++ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile, ++ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { ++ // Do repeated batches of 4-byte loads and stores. ++ unsigned BytesLeft = SizeVal & 3; ++ unsigned NumMemOps = SizeVal >> 2; ++ unsigned EmittedNumMemOps = 0; ++ EVT VT = MVT::i32; ++ unsigned VTSize = 4; ++ unsigned I = 0; ++ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers ++ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6; ++ SmallVector TFOps(6); ++ SmallVector Loads(6); ++ uint64_t SrcOff = 0, DstOff = 0; ++ ++ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone; ++ if (isVolatile) ++ MOFlags = MachineMemOperand::Flags::MOVolatile; ++ MachineMemOperand::Flags LoadMOFlags = MOFlags; ++ if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), ++ DAG.getDataLayout())) ++ LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable; ++ if (auto *V = SrcPtrInfo.V.dyn_cast()) ++ if (isa(V) && cast(V)->isConstant()) ++ LoadMOFlags |= MachineMemOperand::Flags::MOInvariant; ++ MachineMemOperand::Flags StoreMOFlags = MOFlags; ++ if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), ++ DAG.getDataLayout())) ++ StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable; ++ ++ // Emit up to MaxLoads loads, then a TokenFactor barrier, then the ++ // same number of stores. The loads and stores may get combined into ++ // ldm/stm later on. ++ while (EmittedNumMemOps < NumMemOps) { ++ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { ++ Loads[I] = DAG.getLoad(VT, dl, Chain, ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Src, ++ DAG.getConstant(SrcOff, dl, MVT::i32)), ++ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), ++ LoadMOFlags); ++ TFOps[I] = Loads[I].getValue(1); ++ SrcOff += VTSize; ++ } ++ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ++ ArrayRef(TFOps.data(), I)); ++ ++ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { ++ TFOps[I] = DAG.getStore( ++ Chain, dl, Loads[I], ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, ++ DAG.getConstant(DstOff, dl, MVT::i32)), ++ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags); ++ DstOff += VTSize; ++ } ++ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ++ ArrayRef(TFOps.data(), I)); ++ ++ EmittedNumMemOps += I; ++ } ++ ++ if (BytesLeft == 0) ++ return Chain; ++ ++ // Issue loads / stores for the trailing (1 - 3) bytes. ++ unsigned BytesLeftSave = BytesLeft; ++ I = 0; ++ while (BytesLeft) { ++ if (BytesLeft >= 2) { ++ VT = MVT::i16; ++ VTSize = 2; ++ } else { ++ VT = MVT::i8; ++ VTSize = 1; ++ } ++ ++ Loads[I] = DAG.getLoad(VT, dl, Chain, ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Src, ++ DAG.getConstant(SrcOff, dl, MVT::i32)), ++ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), ++ LoadMOFlags); ++ ++ TFOps[I] = Loads[I].getValue(1); ++ ++I; ++ SrcOff += VTSize; ++ BytesLeft -= VTSize; ++ } ++ Chain = ++ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I)); ++ ++ I = 0; ++ BytesLeft = BytesLeftSave; ++ while (BytesLeft) { ++ if (BytesLeft >= 2) { ++ VT = MVT::i16; ++ VTSize = 2; ++ } else { ++ VT = MVT::i8; ++ VTSize = 1; ++ } ++ ++ TFOps[I] = DAG.getStore(Chain, dl, Loads[I], ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, ++ DAG.getConstant(DstOff, dl, MVT::i32)), ++ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), ++ StoreMOFlags); ++ ++I; ++ DstOff += VTSize; ++ BytesLeft -= VTSize; ++ } ++ ++ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ++ ArrayRef(TFOps.data(), I)); ++} ++ + static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, + const SelectionDAG &DAG, + ConstantSDNode *ConstantSize, +@@ -192,6 +309,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, + Alignment.value(), RTLIB::MEMCPY); + ++ if (Subtarget.UseInlineMemcpyAsLdSt) ++ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal, ++ isVolatile, DstPtrInfo, SrcPtrInfo); ++ + unsigned BytesLeft = SizeVal & 3; + unsigned NumMemOps = SizeVal >> 2; + unsigned EmittedNumMemOps = 0; +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +index 275b1c0f8dc0..6ff422c15b12 100644 +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +@@ -44,6 +44,12 @@ public: + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; + ++ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl, ++ const ARMSubtarget &Subtarget, SDValue Chain, ++ SDValue Dst, SDValue Src, uint64_t SizeVal, ++ bool isVolatile, MachinePointerInfo DstPtrInfo, ++ MachinePointerInfo SrcPtrInfo) const; ++ + SDValue + EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, +-- +2.34.1 +