[Perf] Update inlining memcpy patch

nasherm · nasherm · commit 3c24c8d37640 · 2024-11-13T10:59:47.000Z
The patch file that sets the memcpy inlining preference
was slightly incorrect. Its use of the SmallVector data structure
prevented memcpy from being inlined correctly in some tests. The
optimisation was also not enabled for v8.1-M mainline targets, as
it should have been. I've updated the patch file to fix both issues.

Change-Id: Ib7e406f7b08d5928eb8ed8d6d98eb1844fef7fd2
diff --git a/patches/llvm-project-perf/0002-ARM-Codegen-Set-LDM-STM-inlining-preference-for-v7m.patch b/patches/llvm-project-perf/0002-ARM-Codegen-Set-LDM-STM-inlining-preference-for-v7m.patch
@@ -0,0 +1,189 @@
+From a5ba56aadc91cc59bc8b00b77f42594d08fc31c5 Mon Sep 17 00:00:00 2001
+From: nasmnc01 <nashe.mncube@arm.com>
+Author: Scott Douglass <scott.douglass@arm.com>
+Date: Tue, 13 Aug 2024 10:55:51 +0100
+Subject: [PATCH] [ARM][CodeGen]Prefer MEMCPY LDM/STM inlining for v7-m
+
+This patch changes the behaviour of memcpy inlining on v7m targets.
+The old behaviour was to inline memcpys with LDM/STM instructions.
+The new behaviour uses LD/ST instructions for memcpy inlining instead,
+which allowed for performance gains of 1% to 2% on selected benchmarks.
+
+Co-authored-by: Nashe Mncube <nashe.mncube@arm.com>
+---
+ llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 116 ++++++++++++++
+ llvm/lib/Target/ARM/ARMSelectionDAGInfo.h   |   6 +
+ llvm/lib/Target/ARM/ARMSubtarget.h          |   4 +
+ llvm/test/CodeGen/ARM/memcpy-v7m.ll         | 165 ++++++++++++++++++++
+ 4 files changed, 291 insertions(+)
+ create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll
+
+diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+index c57825949c1c..0913b2719813 100644
+--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+@@ -138,6 +138,118 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
+   return CallResult.second;
+ }
+
++SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
++    SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
++    SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
++    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
++  // Copy SizeVal bytes as batches of i32 loads/stores plus a 1-3 byte tail.
++  unsigned BytesLeft = SizeVal & 3;
++  unsigned NumMemOps = SizeVal >> 2;
++  unsigned EmittedNumMemOps = 0;
++  EVT VT = MVT::i32;
++  unsigned VTSize = 4;
++  unsigned I = 0;
++  // Cap each batch at 4 loads on Thumb1 (fewer registers), otherwise at 6.
++  const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
++  SDValue TFOps[6];
++  SDValue Loads[6];
++  uint64_t SrcOff = 0, DstOff = 0;
++
++  MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
++  if (isVolatile)
++    MOFlags = MachineMemOperand::Flags::MOVolatile;
++  MachineMemOperand::Flags LoadMOFlags = MOFlags;
++  if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
++                                   DAG.getDataLayout()))
++    LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable;
++  if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>())
++    if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant())
++      LoadMOFlags |= MachineMemOperand::Flags::MOInvariant;
++  MachineMemOperand::Flags StoreMOFlags = MOFlags;
++  if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
++                                   DAG.getDataLayout()))
++    StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable;
++
++  // Per batch: emit up to MaxLoads loads, join their chains in a TokenFactor,
++  // then emit the same number of stores chained on it and join them again.
++  // The loads and stores may get combined into ldm/stm later on.
++  while (EmittedNumMemOps < NumMemOps) {
++    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
++      Loads[I] = DAG.getLoad(VT, dl, Chain,
++                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
++                                         DAG.getConstant(SrcOff, dl, MVT::i32)),
++                             SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
++                             LoadMOFlags);
++      TFOps[I] = Loads[I].getValue(1);
++      SrcOff += VTSize;
++    }
++    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++
++    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
++      TFOps[I] = DAG.getStore(
++          Chain, dl, Loads[I],
++          DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
++                      DAG.getConstant(DstOff, dl, MVT::i32)),
++          DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
++      DstOff += VTSize;
++    }
++    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++
++    EmittedNumMemOps += I;
++  }
++
++  if (BytesLeft == 0)
++    return Chain;
++
++  // Copy the trailing 1-3 bytes: an i16 while >= 2 bytes remain, then an i8.
++  unsigned BytesLeftSave = BytesLeft;
++  I = 0;
++  while (BytesLeft) {
++    if (BytesLeft >= 2) {
++      VT = MVT::i16;
++      VTSize = 2;
++    } else {
++      VT = MVT::i8;
++      VTSize = 1;
++    }
++
++    Loads[I] = DAG.getLoad(VT, dl, Chain,
++                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
++                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
++                           SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
++                           LoadMOFlags);
++
++    TFOps[I] = Loads[I].getValue(1);
++    ++I;
++    SrcOff += VTSize;
++    BytesLeft -= VTSize;
++  }
++  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++
++  I = 0;
++  BytesLeft = BytesLeftSave;
++  while (BytesLeft) {
++    if (BytesLeft >= 2) {
++      VT = MVT::i16;
++      VTSize = 2;
++    } else {
++      VT = MVT::i8;
++      VTSize = 1;
++    }
++
++    TFOps[I] = DAG.getStore(Chain, dl, Loads[I],
++                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
++                                        DAG.getConstant(DstOff, dl, MVT::i32)),
++                            DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
++                            StoreMOFlags);
++    ++I;
++    DstOff += VTSize;
++    BytesLeft -= VTSize;
++  }
++
++  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++}
++
+ static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
+                                        const SelectionDAG &DAG,
+                                        ConstantSDNode *ConstantSize,
+@@ -192,6 +304,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
+     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+                                   Alignment.value(), RTLIB::MEMCPY);
+
++  if (Subtarget.allowInlineMemcpyAsLdSt())
++    return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
++                            isVolatile, DstPtrInfo, SrcPtrInfo);
++
+   unsigned BytesLeft = SizeVal & 3;
+   unsigned NumMemOps = SizeVal >> 2;
+   unsigned EmittedNumMemOps = 0;
+diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+index 275b1c0f8dc0..6ff422c15b12 100644
+--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+@@ -44,6 +44,12 @@ public:
+                                   MachinePointerInfo DstPtrInfo,
+                                   MachinePointerInfo SrcPtrInfo) const override;
+
++  SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
++                           const ARMSubtarget &Subtarget, SDValue Chain,
++                           SDValue Dst, SDValue Src, uint64_t SizeVal,
++                           bool isVolatile, MachinePointerInfo DstPtrInfo,
++                           MachinePointerInfo SrcPtrInfo) const;
++
+   SDValue
+   EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+                            SDValue Dst, SDValue Src, SDValue Size,
+diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
+index 2f7af05a259f..20aa9e4f334b 100644
+--- a/llvm/lib/Target/ARM/ARMSubtarget.h
++++ b/llvm/lib/Target/ARM/ARMSubtarget.h
+@@ -523,6 +523,10 @@ public:
+   bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                    unsigned PhysReg) const override;
+   unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
++
++  bool allowInlineMemcpyAsLdSt() const {
++    return HasV7Ops && ARMProcClass == MClass;
++  }
+ };
+
+ } // end namespace llvm
+--
+2.34.1
+