|
| 1 | +From 61af6af10d10a08b81d3924fa5b35bfb548b2a05 Mon Sep 17 00:00:00 2001 |
| 2 | +From: nasmnc01 <[email protected]>
| 3 | +Date: Tue, 13 Aug 2024 10:55:51 +0100 |
| 4 | +Subject: [PATCH] [ARM][CodeGen] Disable MEMCPY LDM/STM inlining for v7-m |
| 5 | + |
| 6 | +This patch disables the expansion of MEMCPY to LDM/STM |
| 7 | +on v7-m targets. This is due to a slowdown caused |
| 8 | +by this inlining method. |
| 9 | + |
| 10 | +Change-Id: I91095299c2c67670a16849d08540bdbc07a95adc |
| 11 | +--- |
| 12 | + llvm/lib/Target/ARM/ARMFeatures.td | 5 + |
| 13 | + llvm/lib/Target/ARM/ARMProcessors.td | 2 +- |
| 14 | + llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 121 ++++++++++++++ |
| 15 | + llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 6 + |
| 16 | + llvm/lib/Target/ARM/ARMSubtarget.h | 2 + |
| 17 | + llvm/test/CodeGen/ARM/memcpy-v7m.ll | 165 ++++++++++++++++++++ |
| 18 | + 6 files changed, 300 insertions(+), 1 deletion(-) |
| 19 | + create mode 100644 llvm/test/CodeGen/ARM/memcpy-v7m.ll |
| 20 | + |
| 21 | +diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td |
| 22 | +index bb437698296c..f7fa00aba424 100644 |
| 23 | +--- a/llvm/lib/Target/ARM/ARMFeatures.td |
| 24 | ++++ b/llvm/lib/Target/ARM/ARMFeatures.td |
| 25 | +@@ -510,6 +510,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler", |
| 26 | + "DisablePostRAScheduler", "true", |
| 27 | + "Don't schedule again after register allocation">; |
| 28 | + |
| 29 | ++def FeatureUseInlineMemcpyAsLdSt : |
| 30 | ++ SubtargetFeature<"use-inline-memcpy-ldst", "UseInlineMemcpyAsLdSt", |
| 31 | ++ "true", "Use memcpy inlining as LD/ST instructions">; |
| 32 | ++ |
| 33 | ++ |
| 34 | + // Armv8.5-A extensions |
| 35 | + |
| 36 | + // Has speculation barrier. |
| 37 | +diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td |
| 38 | +index b94a5fc16146..ffb0c86bc687 100644 |
| 39 | +--- a/llvm/lib/Target/ARM/ARMProcessors.td |
| 40 | ++++ b/llvm/lib/Target/ARM/ARMProcessors.td |
| 41 | +@@ -96,7 +96,7 @@ def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus", |
| 42 | + def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", |
| 43 | + "Cortex-M3 ARM processors", []>; |
| 44 | + def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7", |
| 45 | +- "Cortex-M7 ARM processors", []>; |
| 46 | ++ "Cortex-M7 ARM processors", [FeatureUseInlineMemcpyAsLdSt]>; |
| 47 | + |
| 48 | + //===----------------------------------------------------------------------===// |
| 49 | + // ARM processors |
| 50 | +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 51 | +index c57825949c1c..12db2ab1fca2 100644 |
| 52 | +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 53 | ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp |
| 54 | +@@ -12,6 +12,7 @@ |
| 55 | + |
| 56 | + #include "ARMTargetMachine.h" |
| 57 | + #include "ARMTargetTransformInfo.h" |
| 58 | ++#include "llvm/ADT/SmallVector.h" |
| 59 | + #include "llvm/CodeGen/SelectionDAG.h" |
| 60 | + #include "llvm/IR/DerivedTypes.h" |
| 61 | + #include "llvm/Support/CommandLine.h" |
| 62 | +@@ -138,6 +139,122 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( |
| 63 | + return CallResult.second; |
| 64 | + } |
| 65 | + |
| 66 | ++SDValue ARMSelectionDAGInfo::EmitMemcpyAsLdSt( |
| 67 | ++ SelectionDAG &DAG, SDLoc dl, const ARMSubtarget &Subtarget, SDValue Chain, |
| 68 | ++ SDValue Dst, SDValue Src, uint64_t SizeVal, bool isVolatile, |
| 69 | ++ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { |
| 70 | ++ // Do repeated batches of 4-byte loads and stores. |
| 71 | ++ unsigned BytesLeft = SizeVal & 3; |
| 72 | ++ unsigned NumMemOps = SizeVal >> 2; |
| 73 | ++ unsigned EmittedNumMemOps = 0; |
| 74 | ++ EVT VT = MVT::i32; |
| 75 | ++ unsigned VTSize = 4; |
| 76 | ++ unsigned I = 0; |
| 77 | ++ // Emit a maximum of 4 loads in Thumb1 since we have fewer registers |
| 78 | ++ const unsigned MaxLoads = Subtarget.isThumb1Only() ? 4 : 6; |
| 79 | ++ SmallVector<SDValue> TFOps(6); |
| 80 | ++ SmallVector<SDValue> Loads(6); |
| 81 | ++ uint64_t SrcOff = 0, DstOff = 0; |
| 82 | ++ |
| 83 | ++ MachineMemOperand::Flags MOFlags = MachineMemOperand::Flags::MONone; |
| 84 | ++ if (isVolatile) |
| 85 | ++ MOFlags = MachineMemOperand::Flags::MOVolatile; |
| 86 | ++ MachineMemOperand::Flags LoadMOFlags = MOFlags; |
| 87 | ++ if (SrcPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), |
| 88 | ++ DAG.getDataLayout())) |
| 89 | ++ LoadMOFlags |= MachineMemOperand::Flags::MODereferenceable; |
| 90 | ++ if (auto *V = SrcPtrInfo.V.dyn_cast<const Value *>()) |
| 91 | ++ if (isa<GlobalVariable>(V) && cast<GlobalVariable>(V)->isConstant()) |
| 92 | ++ LoadMOFlags |= MachineMemOperand::Flags::MOInvariant; |
| 93 | ++ MachineMemOperand::Flags StoreMOFlags = MOFlags; |
| 94 | ++ if (DstPtrInfo.isDereferenceable(SizeVal, *DAG.getContext(), |
| 95 | ++ DAG.getDataLayout())) |
| 96 | ++ StoreMOFlags |= MachineMemOperand::Flags::MODereferenceable; |
| 97 | ++ |
| 98 | ++ // Emit up to MaxLoads loads, then a TokenFactor barrier, then the |
| 99 | ++ // same number of stores. The loads and stores may get combined into |
| 100 | ++ // ldm/stm later on. |
| 101 | ++ while (EmittedNumMemOps < NumMemOps) { |
| 102 | ++ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { |
| 103 | ++ Loads[I] = DAG.getLoad(VT, dl, Chain, |
| 104 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Src, |
| 105 | ++ DAG.getConstant(SrcOff, dl, MVT::i32)), |
| 106 | ++ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), |
| 107 | ++ LoadMOFlags); |
| 108 | ++ TFOps[I] = Loads[I].getValue(1); |
| 109 | ++ SrcOff += VTSize; |
| 110 | ++ } |
| 111 | ++ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, |
| 112 | ++ ArrayRef(TFOps.data(), I)); |
| 113 | ++ |
| 114 | ++ for (I = 0; I < MaxLoads && EmittedNumMemOps + I < NumMemOps; ++I) { |
| 115 | ++ TFOps[I] = DAG.getStore( |
| 116 | ++ Chain, dl, Loads[I], |
| 117 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, |
| 118 | ++ DAG.getConstant(DstOff, dl, MVT::i32)), |
| 119 | ++ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), StoreMOFlags); |
| 120 | ++ DstOff += VTSize; |
| 121 | ++ } |
| 122 | ++ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, |
| 123 | ++ ArrayRef(TFOps.data(), I)); |
| 124 | ++ |
| 125 | ++ EmittedNumMemOps += I; |
| 126 | ++ } |
| 127 | ++ |
| 128 | ++ if (BytesLeft == 0) |
| 129 | ++ return Chain; |
| 130 | ++ |
| 131 | ++  // Issue loads / stores for the trailing (1 to 3) bytes.
| 132 | ++ unsigned BytesLeftSave = BytesLeft; |
| 133 | ++ I = 0; |
| 134 | ++ while (BytesLeft) { |
| 135 | ++ if (BytesLeft >= 2) { |
| 136 | ++ VT = MVT::i16; |
| 137 | ++ VTSize = 2; |
| 138 | ++ } else { |
| 139 | ++ VT = MVT::i8; |
| 140 | ++ VTSize = 1; |
| 141 | ++ } |
| 142 | ++ |
| 143 | ++ Loads[I] = DAG.getLoad(VT, dl, Chain, |
| 144 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Src, |
| 145 | ++ DAG.getConstant(SrcOff, dl, MVT::i32)), |
| 146 | ++ SrcPtrInfo.getWithOffset(SrcOff), MaybeAlign(0), |
| 147 | ++ LoadMOFlags); |
| 148 | ++ |
| 149 | ++ TFOps[I] = Loads[I].getValue(1); |
| 150 | ++ ++I; |
| 151 | ++ SrcOff += VTSize; |
| 152 | ++ BytesLeft -= VTSize; |
| 153 | ++ } |
| 154 | ++ Chain = |
| 155 | ++ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps.data(), I)); |
| 156 | ++ |
| 157 | ++ I = 0; |
| 158 | ++ BytesLeft = BytesLeftSave; |
| 159 | ++ while (BytesLeft) { |
| 160 | ++ if (BytesLeft >= 2) { |
| 161 | ++ VT = MVT::i16; |
| 162 | ++ VTSize = 2; |
| 163 | ++ } else { |
| 164 | ++ VT = MVT::i8; |
| 165 | ++ VTSize = 1; |
| 166 | ++ } |
| 167 | ++ |
| 168 | ++ TFOps[I] = DAG.getStore(Chain, dl, Loads[I], |
| 169 | ++ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, |
| 170 | ++ DAG.getConstant(DstOff, dl, MVT::i32)), |
| 171 | ++ DstPtrInfo.getWithOffset(DstOff), MaybeAlign(0), |
| 172 | ++ StoreMOFlags); |
| 173 | ++ ++I; |
| 174 | ++ DstOff += VTSize; |
| 175 | ++ BytesLeft -= VTSize; |
| 176 | ++ } |
| 177 | ++ |
| 178 | ++ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, |
| 179 | ++ ArrayRef(TFOps.data(), I)); |
| 180 | ++} |
| 181 | ++ |
| 182 | + static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, |
| 183 | + const SelectionDAG &DAG, |
| 184 | + ConstantSDNode *ConstantSize, |
| 185 | +@@ -192,6 +309,10 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( |
| 186 | + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
| 187 | + Alignment.value(), RTLIB::MEMCPY); |
| 188 | + |
| 189 | ++ if (Subtarget.allowInlineMemcpyAsLdSt()) |
| 190 | ++ return EmitMemcpyAsLdSt(DAG, dl, Subtarget, Chain, Dst, Src, SizeVal, |
| 191 | ++ isVolatile, DstPtrInfo, SrcPtrInfo); |
| 192 | ++ |
| 193 | + unsigned BytesLeft = SizeVal & 3; |
| 194 | + unsigned NumMemOps = SizeVal >> 2; |
| 195 | + unsigned EmittedNumMemOps = 0; |
| 196 | +diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 197 | +index 275b1c0f8dc0..6ff422c15b12 100644 |
| 198 | +--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 199 | ++++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h |
| 200 | +@@ -44,6 +44,12 @@ public: |
| 201 | + MachinePointerInfo DstPtrInfo, |
| 202 | + MachinePointerInfo SrcPtrInfo) const override; |
| 203 | + |
| 204 | ++ SDValue EmitMemcpyAsLdSt(SelectionDAG &DAG, SDLoc dl, |
| 205 | ++ const ARMSubtarget &Subtarget, SDValue Chain, |
| 206 | ++ SDValue Dst, SDValue Src, uint64_t SizeVal, |
| 207 | ++ bool isVolatile, MachinePointerInfo DstPtrInfo, |
| 208 | ++ MachinePointerInfo SrcPtrInfo) const; |
| 209 | ++ |
| 210 | + SDValue |
| 211 | + EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, |
| 212 | + SDValue Dst, SDValue Src, SDValue Size, |
| 213 | +diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h |
| 214 | +index 2f7af05a259f..0acf919b1360 100644 |
| 215 | +--- a/llvm/lib/Target/ARM/ARMSubtarget.h |
| 216 | ++++ b/llvm/lib/Target/ARM/ARMSubtarget.h |
| 217 | +@@ -523,6 +523,8 @@ public: |
| 218 | + bool ignoreCSRForAllocationOrder(const MachineFunction &MF, |
| 219 | + unsigned PhysReg) const override; |
| 220 | + unsigned getGPRAllocationOrder(const MachineFunction &MF) const; |
| 221 | ++ |
| 222 | ++ bool allowInlineMemcpyAsLdSt() const { return UseInlineMemcpyAsLdSt; } |
| 223 | + }; |
| 224 | + |
| 225 | + } // end namespace llvm |
| 226 | +diff --git a/llvm/test/CodeGen/ARM/memcpy-v7m.ll b/llvm/test/CodeGen/ARM/memcpy-v7m.ll |
| 227 | +new file mode 100644 |
| 228 | +index 000000000000..2a90f44fe3d3 |
| 229 | +--- /dev/null |
| 230 | ++++ b/llvm/test/CodeGen/ARM/memcpy-v7m.ll |
| 231 | +@@ -0,0 +1,165 @@ |
| 232 | ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| 233 | ++; RUN: llc -mtriple=thumbv7em-eabi -mcpu=cortex-m7 -verify-machineinstrs %s -o - | FileCheck %s |
| 234 | ++ |
| 235 | ++@d = external global [64 x i32] |
| 236 | ++@s = external global [64 x i32] |
| 237 | ++@d_32 = external global [32 x i32]
| 238 | ++@s_32 = external global [32 x i32]
| 239 | ++ |
| 240 | ++ |
| 241 | ++; Function Attrs: nounwind |
| 242 | ++define void @t1() #0 { |
| 243 | ++; CHECK-LABEL: t1: |
| 244 | ++; CHECK: @ %bb.0: @ %entry |
| 245 | ++; CHECK-NEXT: movw r0, :lower16:d |
| 246 | ++; CHECK-NEXT: movw r2, :lower16:s |
| 247 | ++; CHECK-NEXT: movt r0, :upper16:d |
| 248 | ++; CHECK-NEXT: movt r2, :upper16:s |
| 249 | ++; CHECK-NEXT: ldr r1, [r0] |
| 250 | ++; CHECK-NEXT: str r1, [r2] |
| 251 | ++; CHECK-NEXT: ldr r3, [r0, #4] |
| 252 | ++; CHECK-NEXT: str r3, [r2, #4] |
| 253 | ++; CHECK-NEXT: ldr r1, [r0, #8] |
| 254 | ++; CHECK-NEXT: ldr r3, [r0, #12] |
| 255 | ++; CHECK-NEXT: ldrb r0, [r0, #16] |
| 256 | ++; CHECK-NEXT: strd r1, r3, [r2, #8] |
| 257 | ++; CHECK-NEXT: strb r0, [r2, #16] |
| 258 | ++; CHECK-NEXT: bx lr |
| 259 | ++entry: |
| 260 | ++; We use '[rl0-9]+' to allow 'r0'..'r12', 'lr' |
| 261 | ++ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false) |
| 262 | ++ ret void |
| 263 | ++} |
| 264 | ++ |
| 265 | ++; Function Attrs: nounwind |
| 266 | ++define void @t2() #0 { |
| 267 | ++; CHECK-LABEL: t2: |
| 268 | ++; CHECK: @ %bb.0: @ %entry |
| 269 | ++; CHECK-NEXT: movw r0, :lower16:d |
| 270 | ++; CHECK-NEXT: movw r1, :lower16:s |
| 271 | ++; CHECK-NEXT: movt r0, :upper16:d |
| 272 | ++; CHECK-NEXT: movt r1, :upper16:s |
| 273 | ++; CHECK-NEXT: ldr.w r2, [r0, #11] |
| 274 | ++; CHECK-NEXT: str.w r2, [r1, #11] |
| 275 | ++; CHECK-NEXT: ldr r2, [r0] |
| 276 | ++; CHECK-NEXT: str r2, [r1] |
| 277 | ++; CHECK-NEXT: ldr r2, [r0, #4] |
| 278 | ++; CHECK-NEXT: str r2, [r1, #4] |
| 279 | ++; CHECK-NEXT: ldr r0, [r0, #8] |
| 280 | ++; CHECK-NEXT: str r0, [r1, #8] |
| 281 | ++; CHECK-NEXT: bx lr |
| 282 | ++entry: |
| 283 | ++ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false) |
| 284 | ++ ret void |
| 285 | ++} |
| 286 | ++ |
| 287 | ++; Function Attrs: nounwind |
| 288 | ++declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 |
| 289 | ++ |
| 290 | ++ |
| 291 | ++define void @t3() #0 { |
| 292 | ++; CHECK-LABEL: t3: |
| 293 | ++; CHECK: @ %bb.0: |
| 294 | ++; CHECK-NEXT: movw r0, :lower16:d_32 |
| 295 | ++; CHECK-NEXT: movw r2, :lower16:s_32 |
| 296 | ++; CHECK-NEXT: movt r0, :upper16:d_32 |
| 297 | ++; CHECK-NEXT: movt r2, :upper16:s_32 |
| 298 | ++; CHECK-NEXT: ldr r1, [r0] |
| 299 | ++; CHECK-NEXT: str r1, [r2] |
| 300 | ++; CHECK-NEXT: ldr r3, [r0, #4] |
| 301 | ++; CHECK-NEXT: str r3, [r2, #4] |
| 302 | ++; CHECK-NEXT: ldr r1, [r0, #8] |
| 303 | ++; CHECK-NEXT: ldr r3, [r0, #12] |
| 304 | ++; CHECK-NEXT: ldrb r0, [r0, #16] |
| 305 | ++; CHECK-NEXT: strd r1, r3, [r2, #8] |
| 306 | ++; CHECK-NEXT: strb r0, [r2, #16] |
| 307 | ++; CHECK-NEXT: bx lr |
| 308 | ++ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([32 x i32]* @s_32 to i8*), i8* bitcast ([32 x i32]* @d_32 to i8*), i32 17, i32 4, i1 false) |
| 309 | ++ ret void |
| 310 | ++} |
| 311 | ++ |
| 312 | ++define void @t4() #0 { |
| 313 | ++; CHECK-LABEL: t4: |
| 314 | ++; CHECK: @ %bb.0: |
| 315 | ++; CHECK-NEXT: movw r0, :lower16:d_32 |
| 316 | ++; CHECK-NEXT: movw r1, :lower16:s_32 |
| 317 | ++; CHECK-NEXT: movt r0, :upper16:d_32 |
| 318 | ++; CHECK-NEXT: movt r1, :upper16:s_32 |
| 319 | ++; CHECK-NEXT: ldr.w r2, [r0, #11] |
| 320 | ++; CHECK-NEXT: str.w r2, [r1, #11] |
| 321 | ++; CHECK-NEXT: ldr r2, [r0] |
| 322 | ++; CHECK-NEXT: str r2, [r1] |
| 323 | ++; CHECK-NEXT: ldr r2, [r0, #4] |
| 324 | ++; CHECK-NEXT: str r2, [r1, #4] |
| 325 | ++; CHECK-NEXT: ldr r0, [r0, #8] |
| 326 | ++; CHECK-NEXT: str r0, [r1, #8] |
| 327 | ++; CHECK-NEXT: bx lr |
| 328 | ++ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([32 x i32]* @s_32 to i8*), i8* bitcast ([32 x i32]* @d_32 to i8*), i32 15, i32 4, i1 false) |
| 329 | ++ ret void |
| 330 | ++} |
| 331 | ++ |
| 332 | ++define void @t5() #0 { |
| 333 | ++; CHECK-LABEL: t5: |
| 334 | ++; CHECK: @ %bb.0: @ %entry |
| 335 | ++; CHECK-NEXT: .save {r4, r5, r7, lr} |
| 336 | ++; CHECK-NEXT: push {r4, r5, r7, lr} |
| 337 | ++; CHECK-NEXT: movw r0, :lower16:d |
| 338 | ++; CHECK-NEXT: movw r1, :lower16:s |
| 339 | ++; CHECK-NEXT: movt r0, :upper16:d |
| 340 | ++; CHECK-NEXT: movt r1, :upper16:s |
| 341 | ++; CHECK-NEXT: ldr r0, [r0] |
| 342 | ++; CHECK-NEXT: ldr r1, [r1] |
| 343 | ++; CHECK-NEXT: add.w r12, r0, #12 |
| 344 | ++; CHECK-NEXT: ldr r3, [r0, #24] |
| 345 | ++; CHECK-NEXT: ldrd r2, lr, [r0, #4] |
| 346 | ++; CHECK-NEXT: ldm.w r12, {r4, r5, r12} |
| 347 | ++; CHECK-NEXT: str r3, [r1, #24] |
| 348 | ++; CHECK-NEXT: add.w r3, r1, #12 |
| 349 | ++; CHECK-NEXT: strd r2, lr, [r1, #4] |
| 350 | ++; CHECK-NEXT: stm.w r3, {r4, r5, r12} |
| 351 | ++; CHECK-NEXT: ldr r0, [r0, #28] |
| 352 | ++; CHECK-NEXT: str r0, [r1, #28] |
| 353 | ++; CHECK-NEXT: pop {r4, r5, r7, pc} |
| 354 | ++entry: |
| 355 | ++ %0 = load i32*, i32** @s, align 4 |
| 356 | ++ %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 |
| 357 | ++ %1 = bitcast i32* %arrayidx to i8* |
| 358 | ++ %2 = load i32*, i32** @d, align 4 |
| 359 | ++ %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 |
| 360 | ++ %3 = bitcast i32* %arrayidx1 to i8* |
| 361 | ++ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false) |
| 362 | ++ ret void |
| 363 | ++} |
| 364 | ++ |
| 365 | ++define void @t6() #0 { |
| 366 | ++; CHECK-LABEL: t6: |
| 367 | ++; CHECK: @ %bb.0: @ %entry |
| 368 | ++; CHECK-NEXT: .save {r4, r5, r7, lr} |
| 369 | ++; CHECK-NEXT: push {r4, r5, r7, lr} |
| 370 | ++; CHECK-NEXT: movw r0, :lower16:d |
| 371 | ++; CHECK-NEXT: movw r1, :lower16:s |
| 372 | ++; CHECK-NEXT: movt r0, :upper16:d |
| 373 | ++; CHECK-NEXT: movt r1, :upper16:s |
| 374 | ++; CHECK-NEXT: ldr r0, [r0] |
| 375 | ++; CHECK-NEXT: ldr r1, [r1] |
| 376 | ++; CHECK-NEXT: add.w r12, r0, #12 |
| 377 | ++; CHECK-NEXT: ldr r3, [r0, #24] |
| 378 | ++; CHECK-NEXT: ldrd r2, lr, [r0, #4] |
| 379 | ++; CHECK-NEXT: ldm.w r12, {r4, r5, r12} |
| 380 | ++; CHECK-NEXT: str r3, [r1, #24] |
| 381 | ++; CHECK-NEXT: add.w r3, r1, #12 |
| 382 | ++; CHECK-NEXT: strd r2, lr, [r1, #4] |
| 383 | ++; CHECK-NEXT: stm.w r3, {r4, r5, r12} |
| 384 | ++; CHECK-NEXT: ldr r0, [r0, #28] |
| 385 | ++; CHECK-NEXT: str r0, [r1, #28] |
| 386 | ++; CHECK-NEXT: pop {r4, r5, r7, pc} |
| 387 | ++entry: |
| 388 | ++ %0 = load i32*, i32** @s, align 8 |
| 389 | ++ %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 |
| 390 | ++ %1 = bitcast i32* %arrayidx to i8* |
| 391 | ++ %2 = load i32*, i32** @d, align 8 |
| 392 | ++ %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 |
| 393 | ++ %3 = bitcast i32* %arrayidx1 to i8* |
| 394 | ++ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false) |
| 395 | ++ ret void |
| 396 | ++} |
| 397 | +-- |
| 398 | +2.34.1 |
| 399 | + |
0 commit comments