Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
From 61af6af10d10a08b81d3924fa5b35bfb548b2a05 Mon Sep 17 00:00:00 2001
From: nasmnc01 <[email protected]>
Author: Scott Douglass <[email protected]>
Date: Tue, 13 Aug 2024 10:55:51 +0100
Subject: [PATCH] [ARM][CodeGen] Prefer MEMCPY LDM/STM inlining for v7-m

This patch changes the behaviour of memcpy inlining on v7m targets.
The old behaviour was to inline memcpys with LD/ST instructions.
Alternatively, using LDM/STM instructions for memcpy inlining allowed
for performance gains of 1% to 2% on selected benchmarks.

Co-authored-by: Nashe Mncube <[email protected]>
---
llvm/lib/Target/ARM/ARMFeatures.td | 5 +
llvm/lib/Target/ARM/ARMProcessors.td | 2 +-
llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++
llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 +
llvm/lib/Target/ARM/ARMSubtarget.h | 2 +
llvm/test/CodeGen/ARM/memcpy-v7m.ll | 165 ++++++++++++++++++++
6 files changed, 300 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll

diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index bb437698296c..f7fa00aba424 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
"DisablePostRAScheduler", "true",
"Don't schedule again after register allocation">;

+def FeatureUseInlineMemcpyAsLdSt :
+ SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt",
+ "true", "Use memcpy inlining as LD/ST instructions">;
+
+
// Armv8.5-A extensions

// Has speculation barrier.
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index b94a5fc16146..ffb0c86bc687 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
"Cortex-M3 ARM processors", []>;
def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
- "Cortex-M7 ARM processors", []>;
+ "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>;

//===----------------------------------------------------------------------===//
// ARM processors
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index c57825949c1c..12db2ab1fca2 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -12,6 +12,7 @@

#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
@@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
return CallResult.second;
}

+SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
+ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
+ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ // Do repeated batches of 4-byte loads and stores.
+ unsigned BytesLeft = SizeVal & 3;
+ unsigned NumMemOps = SizeVal >> 2;
+ unsigned EmittedNumMemOps = 0;
+ EVT VT = MVT::i32;
+ unsigned VTSize = 4;
+ unsigned I = 0;
+ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
+ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
+ SmallVector<SDValue> TFOps(6);
+ SmallVector<SDValue> Loads(6);
+ uint64_t SrcOff = 0, DstOff = 0;
+
+ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
+ if (isVolatile)
+ MOFlags = MachineMemOperand::Flags::MOVolatile;
+ MachineMemOperand::Flags LoadMOFlags = MOFlags;
+ if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
+ DAG.getDataLayout()))
+ LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable;
+ if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>())
+ if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant())
+ LoadMOFlags |= MachineMemOperand::Flags::MOInvariant;
+ MachineMemOperand::Flags StoreMOFlags = MOFlags;
+ if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
+ DAG.getDataLayout()))
+ StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable;
+
+ // Emit up to MaxLoads loads, then a TokenFactor barrier, then the
+ // same number of stores. The loads and stores may get combined into
+ // ldm/stm later on.
+ while (EmittedNumMemOps < NumMemOps) {
+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
+ Loads[I] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+ DAG.getConstant(SrcOff, dl, MVT::i32)),
+ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
+ LoadMOFlags);
+ TFOps[I] = Loads[I].getValue(1);
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(TFOps.data(), I));
+
+ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
+ TFOps[I] = DAG.getStore(
+ Chain, dl, Loads[I],
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+ DAG.getConstant(DstOff, dl, MVT::i32)),
+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
+ DstOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(TFOps.data(), I));
+
+ EmittedNumMemOps += I;
+ }
+
+ if (BytesLeft == 0)
+ return Chain;
+
+ // Issue loads / stores for the trailing 1 to 3 bytes.
+ unsigned BytesLeftSave = BytesLeft;
+ I = 0;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ Loads[I] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+ DAG.getConstant(SrcOff, dl, MVT::i32)),
+ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
+ LoadMOFlags);
+
+ TFOps[I] = Loads[I].getValue(1);
+ ++I;
+ SrcOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ Chain =
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I));
+
+ I = 0;
+ BytesLeft = BytesLeftSave;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ TFOps[I] = DAG.getStore(Chain, dl, Loads[I],
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+ DAG.getConstant(DstOff, dl, MVT::i32)),
+ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
+ StoreMOFlags);
+ ++I;
+ DstOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(TFOps.data(), I));
+}
+
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
const SelectionDAG &DAG,
ConstantSDNode *ConstantSize,
@@ -192,6 +309,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMCPY);

+ if (Subtarget.UseInlineMemcpyAsLdSt)
+ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
+ isVolatile, DstPtrInfo, SrcPtrInfo);
+
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
unsigned EmittedNumMemOps = 0;
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index 275b1c0f8dc0..6ff422c15b12 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -44,6 +44,12 @@ public:
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;

+ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
+ const ARMSubtarget &Subtarget, SDValue Chain,
+ SDValue Dst, SDValue Src, uint64_t SizeVal,
+ bool isVolatile, MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const;
+
SDValue
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size,
--
2.34.1

Loading