diff --git a/Omax.cfg b/Omax.cfg
index 6918418c..78bec9fb 100644
--- a/Omax.cfg
+++ b/Omax.cfg
@@ -8,3 +8,4 @@
 -mllvm -enable-dfa-jump-thread \
 -mllvm -enable-loop-flatten \
 -mllvm -enable-unroll-and-jam \
+-mllvm -enable-inline-memcpy-ld-st
diff --git a/patches/llvm-project-perf/0002-ARM-CodeGen-Disable-MEMCPY-LDM-STM-inlining-for-v7-m.patch b/patches/llvm-project-perf/0002-ARM-CodeGen-Disable-MEMCPY-LDM-STM-inlining-for-v7-m.patch
deleted file mode 100644
index b25c6f9b..00000000
--- a/patches/llvm-project-perf/0002-ARM-CodeGen-Disable-MEMCPY-LDM-STM-inlining-for-v7-m.patch
+++ /dev/null
@@ -1,214 +0,0 @@
-From 40f07cbde57022b25412cf1c9239755613500d86 Mon Sep 17 00:00:00 2001
-From: nasmnc01
-Author: Scott Douglass
-Date: Tue, 13 Aug 2024 10:55:51 +0100
-Subject: [PATCH] [ARM][CodeGen] Disable MEMCPY LDM/STM inlining for v7-m
-
-This patch changes the behaviour of memcpy inlining on v7-m targets.
-The old behaviour was to inline memcpy with LDM/STM instructions.
-Using LD/ST instructions instead gives performance gains of 1% to 2%
-on selected benchmarks.
-
-Co-authored-by: Nashe Mncube
----
- llvm/lib/Target/ARM/ARMFeatures.td          |   5 +
- llvm/lib/Target/ARM/ARMProcessors.td        |   2 +-
- llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++++++++
- llvm/lib/Target/ARM/ARMSelectionDAGInfo.h   |   6 +
- 4 files changed, 133 insertions(+), 1 deletion(-)
-
-diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
-index bb437698296c..f7fa00aba424 100644
---- a/llvm/lib/Target/ARM/ARMFeatures.td
-+++ b/llvm/lib/Target/ARM/ARMFeatures.td
-@@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
-                                             "DisablePostRAScheduler", "true",
-                                             "Don't schedule again after register allocation">;
- 
-+def FeatureUseInlineMemcpyAsLdSt :
-+    SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt",
-+                     "true", "Use memcpy inlining as LD/ST instructions">;
-+
-+
- // Armv8.5-A extensions
- 
- // Has speculation barrier.
-diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
-index b94a5fc16146..ffb0c86bc687 100644
---- a/llvm/lib/Target/ARM/ARMProcessors.td
-+++ b/llvm/lib/Target/ARM/ARMProcessors.td
-@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus",
- def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
-                               "Cortex-M3 ARM processors", []>;
- def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
--                              "Cortex-M7 ARM processors", []>;
-+                              "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>;
- 
- //===----------------------------------------------------------------------===//
- // ARM processors
-diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
-index c57825949c1c..63ae7a042886 100644
---- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
-+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
-@@ -12,6 +12,7 @@
- 
- #include "ARMTargetMachine.h"
- #include "ARMTargetTransformInfo.h"
-+#include "llvm/ADT/SmallVector.h"
- #include "llvm/CodeGen/SelectionDAG.h"
- #include "llvm/IR/DerivedTypes.h"
- #include "llvm/Support/CommandLine.h"
-@@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
-   return CallResult.second;
- }
- 
-+SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
-+    SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
-+    SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
-+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
-+  // Do repeated batches of 4-byte loads and stores.
-+  unsigned BytesLeft = SizeVal & 3;
-+  unsigned NumMemOps = SizeVal >> 2;
-+  unsigned EmittedNumMemOps = 0;
-+  EVT VT = MVT::i32;
-+  unsigned VTSize = 4;
-+  unsigned I = 0;
-+  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
-+  const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
-+  SmallVector<SDValue> TFOps(6);
-+  SmallVector<SDValue> Loads(6);
-+  uint64_t SrcOff = 0, DstOff = 0;
-+
-+  MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
-+  if (isVolatile)
-+    MOFlags = MachineMemOperand::Flags::MOVolatile;
-+  MachineMemOperand::Flags LoadMOFlags = MOFlags;
-+  if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
-+                                   DAG.getDataLayout()))
-+    LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable;
-+  if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>())
-+    if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant())
-+      LoadMOFlags |= MachineMemOperand::Flags::MOInvariant;
-+  MachineMemOperand::Flags StoreMOFlags = MOFlags;
-+  if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
-+                                   DAG.getDataLayout()))
-+    StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable;
-+
-+  // Emit up to MaxLoads loads, then a TokenFactor barrier, then the
-+  // same number of stores. The loads and stores may get combined into
-+  // ldm/stm later on.
-+  while (EmittedNumMemOps < NumMemOps) {
-+    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
-+      Loads[I] = DAG.getLoad(VT, dl, Chain,
-+                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
-+                                         DAG.getConstant(SrcOff, dl, MVT::i32)),
-+                             SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
-+                             LoadMOFlags);
-+      TFOps[I] = Loads[I].getValue(1);
-+      SrcOff += VTSize;
-+    }
-+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-+                        ArrayRef(TFOps.data(), I));
-+
-+    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
-+      TFOps[I] = DAG.getStore(
-+          Chain, dl, Loads[I],
-+          DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
-+                      DAG.getConstant(DstOff, dl, MVT::i32)),
-+          DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
-+      DstOff += VTSize;
-+    }
-+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-+                        ArrayRef(TFOps.data(), I));
-+
-+    EmittedNumMemOps += I;
-+  }
-+
-+  if (BytesLeft == 0)
-+    return Chain;
-+
-+  // Issue loads / stores for the trailing (1 - 3) bytes.
-+  unsigned BytesLeftSave = BytesLeft;
-+  I = 0;
-+  while (BytesLeft) {
-+    if (BytesLeft >= 2) {
-+      VT = MVT::i16;
-+      VTSize = 2;
-+    } else {
-+      VT = MVT::i8;
-+      VTSize = 1;
-+    }
-+
-+    Loads[I] = DAG.getLoad(VT, dl, Chain,
-+                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
-+                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
-+                           SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
-+                           LoadMOFlags);
-+
-+    TFOps[I] = Loads[I].getValue(1);
-+    ++I;
-+    SrcOff += VTSize;
-+    BytesLeft -= VTSize;
-+  }
-+  Chain =
-+      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I));
-+
-+  I = 0;
-+  BytesLeft = BytesLeftSave;
-+  while (BytesLeft) {
-+    if (BytesLeft >= 2) {
-+      VT = MVT::i16;
-+      VTSize = 2;
-+    } else {
-+      VT = MVT::i8;
-+      VTSize = 1;
-+    }
-+
-+    TFOps[I] = DAG.getStore(Chain, dl, Loads[I],
-+                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
-+                                        DAG.getConstant(DstOff, dl, MVT::i32)),
-+                            DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
-+                            StoreMOFlags);
-+    ++I;
-+    DstOff += VTSize;
-+    BytesLeft -= VTSize;
-+  }
-+
-+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-+                     ArrayRef(TFOps.data(), I));
-+}
-+
- static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
-                                        const SelectionDAG &DAG,
-                                        ConstantSDNode *ConstantSize,
-@@ -192,6 +309,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
-     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
-                                   Alignment.value(), RTLIB::MEMCPY);
- 
-+  if (Subtarget.useInlineMemcpyAsLdSt())
-+    return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
-+                            isVolatile, DstPtrInfo, SrcPtrInfo);
-+
-   unsigned BytesLeft = SizeVal & 3;
-   unsigned NumMemOps = SizeVal >> 2;
-   unsigned EmittedNumMemOps = 0;
-diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
-index 275b1c0f8dc0..6ff422c15b12 100644
---- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
-+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
-@@ -44,6 +44,12 @@ public:
-                                   MachinePointerInfo DstPtrInfo,
-                                   MachinePointerInfo SrcPtrInfo) const override;
- 
-+  SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
-+                           const ARMSubtarget &Subtarget, SDValue Chain,
-+                           SDValue Dst, SDValue Src, uint64_t SizeVal,
-+                           bool isVolatile, MachinePointerInfo DstPtrInfo,
-+                           MachinePointerInfo SrcPtrInfo) const;
-+
-   SDValue
-   EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
-                            SDValue Dst, SDValue Src, SDValue Size,
--- 
-2.34.1
-
diff --git a/patches/llvm-project-perf/0002-ARM-Codegen-Set-LDM-STM-inlining-preference-for-v7m.patch b/patches/llvm-project-perf/0002-ARM-Codegen-Set-LDM-STM-inlining-preference-for-v7m.patch
new file mode 100644
index 00000000..20dffa45
--- /dev/null
+++ b/patches/llvm-project-perf/0002-ARM-Codegen-Set-LDM-STM-inlining-preference-for-v7m.patch
@@ -0,0 +1,350 @@
+From 8aa999e52ef03be7d8c05f4bd151d7df60d17e8f Mon Sep 17 00:00:00 2001
+From: Scott Douglass
+Date: Tue, 13 Aug 2024 10:55:51 +0100
+Subject: [PATCH] [ARM][CodeGen] Set LDM/STM inlining preference for v7-m
+
+This patch changes the behaviour of memcpy inlining on v7-m targets.
+The default behaviour remains to inline memcpy with LDM/STM
+instructions. Inlining with LD/ST instructions instead, which gave
+performance gains of 1% to 2% on selected benchmarks, is now opt-in
+via the -enable-inline-memcpy-ld-st backend option.
+
+Co-authored-by: Nashe Mncube
+---
+ llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 120 +++++++++++++++
+ llvm/lib/Target/ARM/ARMSelectionDAGInfo.h   |   6 +
+ llvm/test/CodeGen/ARM/memcpy-v7m.ll         | 161 ++++++++++++++++++++
+ 3 files changed, 287 insertions(+)
+ create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll
+
+diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+index e7ea10ff971a..09ed4ab219a1 100644
+--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+@@ -30,6 +30,10 @@ cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
+                          "Allow (may be subject to certain conditions) "
+                          "conversion of memcpy to TP loop.")));
+ 
++static cl::opt<bool> EnableInlineMemcpyAsLdSt(
++    "enable-inline-memcpy-ld-st", cl::init(false), cl::Hidden,
++    cl::desc("Inline memcpy with LD/ST instructions."));
++
+ // Emit, if possible, a specialized version of the given Libcall. Typically this
+ // means selecting the appropriately aligned version, but we also convert memset
+ // of 0 into memclr.
+@@ -136,6 +140,118 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
+   return CallResult.second;
+ }
+ 
++SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt(
++    SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain,
++    SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile,
++    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
++  // Do repeated batches of 4-byte loads and stores.
++  unsigned BytesLeft = SizeVal & 3;
++  unsigned NumMemOps = SizeVal >> 2;
++  unsigned EmittedNumMemOps = 0;
++  EVT VT = MVT::i32;
++  unsigned VTSize = 4;
++  unsigned I = 0;
++  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
++  const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6;
++  SDValue TFOps[6];
++  SDValue Loads[6];
++  uint64_t SrcOff = 0, DstOff = 0;
++
++  MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone;
++  if (isVolatile)
++    MOFlags = MachineMemOperand::Flags::MOVolatile;
++  MachineMemOperand::Flags LoadMOFlags = MOFlags;
++  if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
++                                   DAG.getDataLayout()))
++    LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable;
++  if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>())
++    if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant())
++      LoadMOFlags |= MachineMemOperand::Flags::MOInvariant;
++  MachineMemOperand::Flags StoreMOFlags = MOFlags;
++  if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(),
++                                   DAG.getDataLayout()))
++    StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable;
++
++  // Emit up to MaxLoads loads, then a TokenFactor barrier, then the
++  // same number of stores. The loads and stores may get combined into
++  // ldm/stm later on.
++  while (EmittedNumMemOps < NumMemOps) {
++    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
++      Loads[I] = DAG.getLoad(VT, dl, Chain,
++                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
++                                         DAG.getConstant(SrcOff, dl, MVT::i32)),
++                             SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
++                             LoadMOFlags);
++      TFOps[I] = Loads[I].getValue(1);
++      SrcOff += VTSize;
++    }
++    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++
++    for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) {
++      TFOps[I] = DAG.getStore(
++          Chain, dl, Loads[I],
++          DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
++                      DAG.getConstant(DstOff, dl, MVT::i32)),
++          DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags);
++      DstOff += VTSize;
++    }
++    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++
++    EmittedNumMemOps += I;
++  }
++
++  if (BytesLeft == 0)
++    return Chain;
++
++  // Issue loads / stores for the trailing (1 - 3) bytes.
++  unsigned BytesLeftSave = BytesLeft;
++  I = 0;
++  while (BytesLeft) {
++    if (BytesLeft >= 2) {
++      VT = MVT::i16;
++      VTSize = 2;
++    } else {
++      VT = MVT::i8;
++      VTSize = 1;
++    }
++
++    Loads[I] = DAG.getLoad(VT, dl, Chain,
++                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
++                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
++                           SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0),
++                           LoadMOFlags);
++
++    TFOps[I] = Loads[I].getValue(1);
++    ++I;
++    SrcOff += VTSize;
++    BytesLeft -= VTSize;
++  }
++  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++
++  I = 0;
++  BytesLeft = BytesLeftSave;
++  while (BytesLeft) {
++    if (BytesLeft >= 2) {
++      VT = MVT::i16;
++      VTSize = 2;
++    } else {
++      VT = MVT::i8;
++      VTSize = 1;
++    }
++
++    TFOps[I] = DAG.getStore(Chain, dl, Loads[I],
++                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
++                                        DAG.getConstant(DstOff, dl, MVT::i32)),
++                            DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0),
++                            StoreMOFlags);
++    ++I;
++    DstOff += VTSize;
++    BytesLeft -= VTSize;
++  }
++
++  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, I));
++}
++
+ static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
+                                        const SelectionDAG &DAG,
+                                        ConstantSDNode *ConstantSize,
+@@ -190,6 +306,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
+     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+                                   Alignment.value(), RTLIB::MEMCPY);
+ 
++  if (EnableInlineMemcpyAsLdSt && Subtarget.isMClass() && Subtarget.hasV7Ops())
++    return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal,
++                            isVolatile, DstPtrInfo, SrcPtrInfo);
++
+   unsigned BytesLeft = SizeVal & 3;
+   unsigned NumMemOps = SizeVal >> 2;
+   unsigned EmittedNumMemOps = 0;
+diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+index 275b1c0f8dc0..6ff422c15b12 100644
+--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+@@ -44,6 +44,12 @@ public:
+                                   MachinePointerInfo DstPtrInfo,
+                                   MachinePointerInfo SrcPtrInfo) const override;
+ 
++  SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl,
++                           const ARMSubtarget &Subtarget, SDValue Chain,
++                           SDValue Dst, SDValue Src, uint64_t SizeVal,
++                           bool isVolatile, MachinePointerInfo DstPtrInfo,
++                           MachinePointerInfo SrcPtrInfo) const;
++
+   SDValue
+   EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+                            SDValue Dst, SDValue Src, SDValue Size,
+diff --git a/llvm/test/CodeGen/ARM/memcpy-v7m.ll b/llvm/test/CodeGen/ARM/memcpy-v7m.ll
+new file mode 100644
+index 000000000000..12f74c04087e
+--- /dev/null
++++ b/llvm/test/CodeGen/ARM/memcpy-v7m.ll
+@@ -0,0 +1,161 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
++; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m7 -verify-machineinstrs -enable-inline-memcpy-ld-st %s -o - | FileCheck %s
++
++@d = external global [64 x i32]
++@s = external global [64 x i32]
++@d_32 = external global [32 x i32]
++@s_32 = external global [32 x i32]
++
++
++define void @t1() #0 {
++; CHECK-LABEL: t1:
++; CHECK:       @ %bb.0: @ %entry
++; CHECK-NEXT:    movw r0, :lower16:d
++; CHECK-NEXT:    movw r2, :lower16:s
++; CHECK-NEXT:    movt r0, :upper16:d
++; CHECK-NEXT:    movt r2, :upper16:s
++; CHECK-NEXT:    ldr r1, [r0]
++; CHECK-NEXT:    str r1, [r2]
++; CHECK-NEXT:    ldr r3, [r0, #4]
++; CHECK-NEXT:    str r3, [r2, #4]
++; CHECK-NEXT:    ldr r1, [r0, #8]
++; CHECK-NEXT:    ldr r3, [r0, #12]
++; CHECK-NEXT:    ldrb r0, [r0, #16]
++; CHECK-NEXT:    strd r1, r3, [r2, #8]
++; CHECK-NEXT:    strb r0, [r2, #16]
++; CHECK-NEXT:    bx lr
++entry:
++  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
++  ret void
++}
++
++define void @t2() #0 {
++; CHECK-LABEL: t2:
++; CHECK:       @ %bb.0: @ %entry
++; CHECK-NEXT:    movw r0, :lower16:d
++; CHECK-NEXT:    movw r1, :lower16:s
++; CHECK-NEXT:    movt r0, :upper16:d
++; CHECK-NEXT:    movt r1, :upper16:s
++; CHECK-NEXT:    ldr.w r2, [r0, #11]
++; CHECK-NEXT:    str.w r2, [r1, #11]
++; CHECK-NEXT:    ldr r2, [r0]
++; CHECK-NEXT:    str r2, [r1]
++; CHECK-NEXT:    ldr r2, [r0, #4]
++; CHECK-NEXT:    str r2, [r1, #4]
++; CHECK-NEXT:    ldr r0, [r0, #8]
++; CHECK-NEXT:    str r0, [r1, #8]
++; CHECK-NEXT:    bx lr
++entry:
++  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
++  ret void
++}
++
++declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
++
++
++define void @t3() #0 {
++; CHECK-LABEL: t3:
++; CHECK:       @ %bb.0:
++; CHECK-NEXT:    movw r0, :lower16:d_32
++; CHECK-NEXT:    movw r2, :lower16:s_32
++; CHECK-NEXT:    movt r0, :upper16:d_32
++; CHECK-NEXT:    movt r2, :upper16:s_32
++; CHECK-NEXT:    ldr r1, [r0]
++; CHECK-NEXT:    str r1, [r2]
++; CHECK-NEXT:    ldr r3, [r0, #4]
++; CHECK-NEXT:    str r3, [r2, #4]
++; CHECK-NEXT:    ldr r1, [r0, #8]
++; CHECK-NEXT:    ldr r3, [r0, #12]
++; CHECK-NEXT:    ldrb r0, [r0, #16]
++; CHECK-NEXT:    strd r1, r3, [r2, #8]
++; CHECK-NEXT:    strb r0, [r2, #16]
++; CHECK-NEXT:    bx lr
++  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([32 x i32]* @s_32 to i8*), i8* bitcast ([32 x i32]* @d_32 to i8*), i32 17, i32 4, i1 false)
++  ret void
++}
++
++define void @t4() #0 {
++; CHECK-LABEL: t4:
++; CHECK:       @ %bb.0:
++; CHECK-NEXT:    movw r0, :lower16:d_32
++; CHECK-NEXT:    movw r1, :lower16:s_32
++; CHECK-NEXT:    movt r0, :upper16:d_32
++; CHECK-NEXT:    movt r1, :upper16:s_32
++; CHECK-NEXT:    ldr.w r2, [r0, #11]
++; CHECK-NEXT:    str.w r2, [r1, #11]
++; CHECK-NEXT:    ldr r2, [r0]
++; CHECK-NEXT:    str r2, [r1]
++; CHECK-NEXT:    ldr r2, [r0, #4]
++; CHECK-NEXT:    str r2, [r1, #4]
++; CHECK-NEXT:    ldr r0, [r0, #8]
++; CHECK-NEXT:    str r0, [r1, #8]
++; CHECK-NEXT:    bx lr
++  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([32 x i32]* @s_32 to i8*), i8* bitcast ([32 x i32]* @d_32 to i8*), i32 15, i32 4, i1 false)
++  ret void
++}
++
++define void @t5() #0 {
++; CHECK-LABEL: t5:
++; CHECK:       @ %bb.0: @ %entry
++; CHECK-NEXT:    .save {r4, r5, r7, lr}
++; CHECK-NEXT:    push {r4, r5, r7, lr}
++; CHECK-NEXT:    movw r0, :lower16:d
++; CHECK-NEXT:    movw r1, :lower16:s
++; CHECK-NEXT:    movt r0, :upper16:d
++; CHECK-NEXT:    movt r1, :upper16:s
++; CHECK-NEXT:    ldr r0, [r0]
++; CHECK-NEXT:    ldr r1, [r1]
++; CHECK-NEXT:    add.w r12, r0, #12
++; CHECK-NEXT:    ldr r3, [r0, #24]
++; CHECK-NEXT:    ldrd r2, lr, [r0, #4]
++; CHECK-NEXT:    ldm.w r12, {r4, r5, r12}
++; CHECK-NEXT:    str r3, [r1, #24]
++; CHECK-NEXT:    add.w r3, r1, #12
++; CHECK-NEXT:    strd r2, lr, [r1, #4]
++; CHECK-NEXT:    stm.w r3, {r4, r5, r12}
++; CHECK-NEXT:    ldr r0, [r0, #28]
++; CHECK-NEXT:    str r0, [r1, #28]
++; CHECK-NEXT:    pop {r4, r5, r7, pc}
++entry:
++  %0 = load i32*, i32** @s, align 4
++  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
++  %1 = bitcast i32* %arrayidx to i8*
++  %2 = load i32*, i32** @d, align 4
++  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
++  %3 = bitcast i32* %arrayidx1 to i8*
++  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
++  ret void
++}
++
++define void @t6() #0 {
++; CHECK-LABEL: t6:
++; CHECK:       @ %bb.0: @ %entry
++; CHECK-NEXT:    .save {r4, r5, r7, lr}
++; CHECK-NEXT:    push {r4, r5, r7, lr}
++; CHECK-NEXT:    movw r0, :lower16:d
++; CHECK-NEXT:    movw r1, :lower16:s
++; CHECK-NEXT:    movt r0, :upper16:d
++; CHECK-NEXT:    movt r1, :upper16:s
++; CHECK-NEXT:    ldr r0, [r0]
++; CHECK-NEXT:    ldr r1, [r1]
++; CHECK-NEXT:    add.w r12, r0, #12
++; CHECK-NEXT:    ldr r3, [r0, #24]
++; CHECK-NEXT:    ldrd r2, lr, [r0, #4]
++; CHECK-NEXT:    ldm.w r12, {r4, r5, r12}
++; CHECK-NEXT:    str r3, [r1, #24]
++; CHECK-NEXT:    add.w r3, r1, #12
++; CHECK-NEXT:    strd r2, lr, [r1, #4]
++; CHECK-NEXT:    stm.w r3, {r4, r5, r12}
++; CHECK-NEXT:    ldr r0, [r0, #28]
++; CHECK-NEXT:    str r0, [r1, #28]
++; CHECK-NEXT:    pop {r4, r5, r7, pc}
++entry:
++  %0 = load i32*, i32** @s, align 8
++  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
++  %1 = bitcast i32* %arrayidx to i8*
++  %2 = load i32*, i32** @d, align 8
++  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
++  %3 = bitcast i32* %arrayidx1 to i8*
++  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
++  ret void
++}
+-- 
+2.34.1
+
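Note: with the replacement patch, the LD/ST memcpy lowering is reached only when the hidden backend option is passed, instead of being tied to the Cortex-M7 subtarget feature. A minimal sketch of the two entry points (the llc line mirrors the RUN line in the test above, the clang line mirrors the Omax.cfg change; memcpy.ll and memcpy.c are placeholder input names):

    llc -mtriple=arm-none-eabi -mcpu=cortex-m7 -enable-inline-memcpy-ld-st memcpy.ll -o -
    clang --target=arm-none-eabi -mcpu=cortex-m7 -O2 -mllvm -enable-inline-memcpy-ld-st -c memcpy.c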