diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f238aefef4400..1e21b6b79ed12 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1432,12 +1432,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) @@ -6402,8 +6414,34 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend, return DataVT.isFixedLengthVector() || 
DataVT.getVectorMinNumElements() > 2;
 }
 
+/// Return true if \p LD is a non-volatile v2i8, v4i8 or v2i16 load that can
+/// be done as one scalar FP/SIMD load (NEON available, alignment permitting).
+static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
+                                            const AArch64Subtarget &Subtarget) {
+  if (!Subtarget.isNeonAvailable() || LD->isVolatile())
+    return false;
+
+  EVT MemVT = LD->getMemoryVT();
+  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
+    return false;
+
+  // The whole vector is read with a single access of its store size, so under
+  // strict alignment the load must be aligned to that size.
+  Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
+  if (Subtarget.requiresStrictAlign() && LD->getAlign() < RequiredAlignment)
+    return false;
+
+  return true;
+}
+
 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   EVT ExtVT = ExtVal.getValueType();
+  // Small, illegal vectors can be extended inreg.
+  if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
+    if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
+        isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
+      return true;
+  }
   if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
     return false;
 
@@ -6859,12 +6897,97 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
   return Result;
 }
 
+/// Lower an extending load of a small vector (v2i8, v4i8 or v2i16 in memory)
+/// as one scalar FP-register load followed by in-register extends. These
+/// patterns would otherwise get scalarized into inefficient sequences.
+///
+/// \returns the merged {extended value, chain} pair, or an empty SDValue if
+/// \p Load is not eligible or is not an extending load.
+static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG) {
+  const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
+    return SDValue();
+
+  EVT MemVT = Load->getMemoryVT();
+  EVT ResVT = Load->getValueType(0);
+  unsigned NumElts = ResVT.getVectorNumElements();
+  unsigned DstEltBits = ResVT.getScalarSizeInBits();
+  unsigned SrcEltBits = MemVT.getScalarSizeInBits();
+
+  // EXTLOAD is lowered like ZEXTLOAD, i.e. with a zero-extend.
+  unsigned ExtOpcode;
+  switch (Load->getExtensionType()) {
+  case ISD::EXTLOAD:
+  case ISD::ZEXTLOAD:
+    ExtOpcode = ISD::ZERO_EXTEND;
+    break;
+  case ISD::SEXTLOAD:
+    ExtOpcode = ISD::SIGN_EXTEND;
+    break;
+  case ISD::NON_EXTLOAD:
+    return SDValue();
+  }
+
+  SDLoc DL(Load);
+
+  // Load the data as an FP scalar to avoid issues with integer loads. Carry
+  // over the original load's memory-operand flags and alias info so they are
+  // not lost on the replacement load.
+  unsigned LoadBits = MemVT.getStoreSizeInBits();
+  MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
+  SDValue ScalarLoad = DAG.getLoad(
+      ScalarLoadType, DL, Load->getChain(), Load->getBasePtr(),
+      Load->getPointerInfo(), Load->getAlign(),
+      Load->getMemOperand()->getFlags(), Load->getAAInfo());
+
+  // Turn the scalar into a 128-bit vector and reinterpret it as lanes of the
+  // source element type.
+  MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
+  SDValue ScalarToVec =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
+  MVT BitcastTy =
+      MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
+
+  // Widen step by step, doubling the element width each time. A 128-bit
+  // value is first narrowed to the first half of its lanes so the doubled
+  // result still fits in 128 bits.
+  SDValue Res = Bitcast;
+  unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
+  unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
+  while (CurrentEltBits < DstEltBits) {
+    if (Res.getValueSizeInBits() >= 128) {
+      CurrentNumElts = CurrentNumElts / 2;
+      MVT ExtractVT =
+          MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
+      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
+                        DAG.getConstant(0, DL, MVT::i64));
+    }
+    CurrentEltBits = CurrentEltBits * 2;
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
+    Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
+  }
+
+  // Drop the surplus lanes if more than NumElts lanes were widened.
+  if (CurrentNumElts != NumElts) {
+    MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
+                      DAG.getConstant(0, DL, MVT::i64));
+  }
+
+  return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
+}
+
 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
                                          SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   assert(LoadNode && "Expected custom lowering of a load node");
 
+  if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
+    return Result;
+
   if (LoadNode->getMemoryVT() == MVT::i64x8) {
     SmallVector<SDValue, 8> Ops;
     SDValue Base = LoadNode->getBasePtr();
@@ -6883,37 +7006,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
     return DAG.getMergeValues({Loaded, Chain}, DL);
   }
 
-  // Custom lowering for extending v4i8 vector loads.
-  EVT VT = Op->getValueType(0);
-  assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
-
-  if (LoadNode->getMemoryVT() != MVT::v4i8)
-    return SDValue();
-
-  // Avoid generating unaligned loads.
- if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4)) - return SDValue(); - - unsigned ExtType; - if (LoadNode->getExtensionType() == ISD::SEXTLOAD) - ExtType = ISD::SIGN_EXTEND; - else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || - LoadNode->getExtensionType() == ISD::EXTLOAD) - ExtType = ISD::ZERO_EXTEND; - else - return SDValue(); - - SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), - LoadNode->getBasePtr(), MachinePointerInfo()); - SDValue Chain = Load.getValue(1); - SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); - SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); - SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); - Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, - DAG.getConstant(0, DL, MVT::i64)); - if (VT == MVT::v4i32) - Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); - return DAG.getMergeValues({Ext, Chain}, DL); + return SDValue(); } SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll index 317feb5ad9ad0..0ef2b31d00daa 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll @@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) { define <2 x i16> @test1(ptr %v2i16_ptr) { ; CHECK-LE-LABEL: test1: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #2 -; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test1: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #2 -; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, 
v0.2s ; CHECK-BE-NEXT: ret %v2i16 = load <2 x i16>, ptr %v2i16_ptr @@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) { define <2 x i8> @test3(ptr %v2i8_ptr) { ; CHECK-LE-LABEL: test3: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #1 -; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test3: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #1 -; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i8 = load <2 x i8>, ptr %v2i8_ptr @@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) { define <2 x i32> @fsext_v2i32(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i32: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i32: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) { define <2 x i16> @fsext_v2i16(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i16: ; CHECK-LE: // %bb.0: -; 
CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i16: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -497,3 +495,213 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric %v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1 ret <4 x i8> %v4i8 } + +define <2 x i16> @zext_v2i8_v2i16(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @zext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b 
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @zext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, 
v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @zext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: zext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = zext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} + +define <2 x i64> @sext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @sext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i16_v2i64(ptr %a) { +; 
CHECK-LE-LABEL: sext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @sext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: sext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = sext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 5ea4678ebac7b..3bfeefd16b814 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -222,21 +222,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: ldrh w8, [x0, #2] -; CHECK-NEON-NEXT: ldr h0, [x0] +; CHECK-NEON-NEXT: ldr s0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] -; CHECK-NEON-NEXT: mov v0.d[1], x8 -; CHECK-NEON-NEXT: xtn v0.2s, v0.2d +; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: ldrh w8, [x0, #2] 
-; CHECK-SVE-NEXT: ldr h0, [x0] +; CHECK-SVE-NEXT: ldr s0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] -; CHECK-SVE-NEXT: mov v0.d[1], x8 -; CHECK-SVE-NEXT: xtn v0.2s, v0.2d +; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index 96168cb80196f..7502db4c5aa93 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; 
CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index a7875dbebd0e6..d8d003c85eed6 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -176,12 +176,12 @@ entry: define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -212,12 +212,12 @@ entry: define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -248,12 +248,12 @@ entry: define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b 
}[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -293,10 +293,9 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -345,10 +344,9 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -397,10 +395,9 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -698,12 +695,10 @@ entry: define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] 
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -734,12 +729,10 @@ entry: define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -770,12 +763,10 @@ entry: define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index d2f72ecacc86c..1c9201d007eab 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -433,12 +433,8 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: 
.cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: add x8, sp, #12 ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 04124609eec74..b1b869ec9e1ff 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -6,11 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] +; CHECK-SD-NEXT: ldr h1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #24 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] @@ -47,10 +46,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -145,11 +143,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] +; CHECK-SD-NEXT: ldr s1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #16 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; 
CHECK-SD-NEXT: mov s1, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index c739be95cd243..015fc0ea4cb44 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -6,10 +6,9 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h @@ -46,10 +45,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -143,10 +141,8 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index fc9bf2c0aca65..c9181b4c312d1 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -6,10 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: ldr h0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; 
CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #8 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b @@ -59,10 +59,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -219,10 +218,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #16 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index cabb0e7278e40..d18cff51c6101 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -263,16 +263,14 @@ define <16 x i16> @load_v16i8(ptr %p) { define <2 x i16> @std_v2i8_v2i16(ptr %p) { ; CHECK-LABEL: std_v2i8_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.2s, v0.2s, #3 -; CHECK-NEXT: add v0.2s, 
v1.2s, v0.2s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %l1 = load <2 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 2 @@ -1394,12 +1392,12 @@ define <4 x i32> @volatile(ptr %p) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ldr s1, [x0, #4] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ldr s0, [x0, #4] +; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %l1b = load volatile float, ptr %p diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c4bb6e37d6eaf..b138fa4085427 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) { define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) { ; CHECK-SD-LABEL: load_v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; @@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) { define <2 x i16> @load_v2i16(ptr %ptr) { ; CHECK-SD-LABEL: load_v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 
9c69a6f03b858..475bd22c6ebcb 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -68,13 +68,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -113,10 +111,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -240,13 +237,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 1c4a504d0ab70..b31a5ea0b5d79 100644 --- 
a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll index 3e708b0678fbc..297b25ed075e4 100644 --- a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll @@ -244,11 +244,9 @@ define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) { ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB3_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #1 -; CHECK-NEXT: 
ldrsb w10, [x9] -; CHECK-NEXT: ldrsb w9, [x9, #1] -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, x8, lsl #1] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: str q0, [x1, x8, lsl #4] diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 3af858713525b..02eb40b412efd 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s 
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 3a9f12b838702..1dc55fccc3dac 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -207,13 +207,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: strb w2, [x3, #2] ; CHECK-SD-NEXT: mov v0.h[1], w1 ; CHECK-SD-NEXT: mov v0.h[2], w2 ; CHECK-SD-NEXT: xtn v0.8b, v0.8h -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] -; CHECK-SD-NEXT: strb w2, [x3, #2] -; CHECK-SD-NEXT: strh w8, [x3] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: str h0, [x3] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 5e278d59b6591..dd920b98e18eb 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; 
CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll index f6ed2e6a787f0..33669f5697575 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -7,8 +7,10 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i32> @load_zext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_zext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> @@ -99,8 +101,10 @@ define void @load_zext_v64i16i32(ptr %ap, ptr %b) #0 { define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_sext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %ap %val = sext <4 x i16> %a to <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 093e6cd9328c8..b1a0c6ca167d8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 @@ -165,11 +164,9 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ed03f9b322432..4fb3bf7392d4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; 
CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 @@ -159,11 +158,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 3cfb24aaccb11..cd02d18e61643 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -156,16 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] ; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -210,16 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; 
CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] ; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index a71cf95a728db..ef70137e6deee 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -156,14 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -208,14 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; 
CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll index f6553b6acec9d..6d4061fb02cff 100644 --- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll +++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll @@ -1,9 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s -; CHECK-LABEL: convert_v3f32 -; CHECK: strb -; CHECK: strh define void @convert_v3f32() { +; CHECK-LABEL: convert_v3f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: strb wzr, [x8] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret entry: br label %bb diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 25702ef25510c..fde2e2a1429eb 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -896,16 +896,13 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { ; CHECK-SD-NEXT: shl.16b v0, v0, #7 ; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE ; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; CHECK-SD-NEXT: add x8, sp, #14 ; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 ; CHECK-SD-NEXT: and.16b v0, v0, v1 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 ; CHECK-SD-NEXT: zip1.16b v0, v0, v1 ; CHECK-SD-NEXT: addv.8h h0, v0 -; CHECK-SD-NEXT: str h0, [sp, #14] -; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x1 -; 
CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ushll.8h v0, v0, #0 +; CHECK-SD-NEXT: ushll.4s v0, v0, #0 ; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 7d3f5bc270d6b..60414adba75fc 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -372,13 +372,13 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ldr s0, [x0] ; BE-NEXT: ldrh w8, [x0, #4] ; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: strb w8, [x1, #2] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: str s0, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: str h0, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -422,10 +422,10 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -604,10 +604,10 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -638,10 +638,10 @@ define void 
@shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -672,10 +672,10 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -706,10 +706,10 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #3] -; BE-NEXT: sturh w8, [x1, #1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -741,10 +741,10 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #5] -; BE-NEXT: sturh w8, [x1, #3] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #3] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -764,10 +764,9 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: shrn.4h v0, v0, #16 ; CHECK-NEXT: uzp1.8b v1, v0, v0 ; CHECK-NEXT: 
mov h0, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w8, [sp, #12] +; CHECK-NEXT: ushll.4s v1, v1, #0 ; CHECK-NEXT: stur b0, [x1, #2] -; CHECK-NEXT: strh w8, [x1] +; CHECK-NEXT: str h1, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; @@ -780,10 +779,10 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -832,10 +831,10 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 @@ -885,10 +884,10 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1