diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 7c395a9e01ee5..d969d1a08d4f8 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -217,11 +217,19 @@ def mul_const : GICombineRule< (apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }]) >; -def lower_mull : GICombineRule< - (defs root:$root), - (match (wip_match_opcode G_MUL):$root, - [{ return matchExtMulToMULL(*${root}, MRI); }]), - (apply [{ applyExtMulToMULL(*${root}, MRI, B, Observer); }]) +def mull_matchdata : GIDefMatchData<"std::tuple<bool, Register, Register>">; +def extmultomull : GICombineRule< + (defs root:$root, mull_matchdata:$matchinfo), + (match (G_MUL $dst, $src1, $src2):$root, + [{ return matchExtMulToMULL(*${root}, MRI, VT, ${matchinfo}); }]), + (apply [{ applyExtMulToMULL(*${root}, MRI, B, Observer, ${matchinfo}); }]) +>; + +def lower_mulv2s64 : GICombineRule< + (defs root:$root, mull_matchdata:$matchinfo), + (match (G_MUL $dst, $src1, $src2):$root, + [{ return matchMulv2s64(*${root}, MRI); }]), + (apply [{ applyMulv2s64(*${root}, MRI, B, Observer); }]) >; def build_vector_to_dup : GICombineRule< @@ -316,7 +324,7 @@ def AArch64PostLegalizerLowering icmp_lowering, build_vector_lowering, lower_vector_fcmp, form_truncstore, vector_sext_inreg_to_shift, - unmerge_ext_to_unmerge, lower_mull, + unmerge_ext_to_unmerge, lower_mulv2s64, vector_unmerge_lowering, insertelt_nonconst]> { } @@ -339,5 +347,5 @@ def AArch64PostLegalizerCombiner select_to_minmax, or_to_bsp, combine_concat_vector, commute_constant_to_rhs, push_freeze_to_prevent_poison_from_propagating, - combine_mul_cmlt, combine_use_vector_truncate]> { + combine_mul_cmlt, combine_use_vector_truncate, extmultomull]> { } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index 96569f77bc224..32c33990ad348 100644 --- 
a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -438,6 +438,122 @@ void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } +// Match mul({z/s}ext , {z/s}ext) => {u/s}mull +bool matchExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI, + GISelValueTracking *KB, + std::tuple<bool, Register, Register> &MatchInfo) { + // Get the instructions that defined the source operand + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); + MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + unsigned I1Opc = I1->getOpcode(); + unsigned I2Opc = I2->getOpcode(); + unsigned EltSize = DstTy.getScalarSizeInBits(); + + if (!DstTy.isVector() || I1->getNumOperands() < 2 || I2->getNumOperands() < 2) + return false; + + auto IsAtLeastDoubleExtend = [&](Register R) { + LLT Ty = MRI.getType(R); + return EltSize >= Ty.getScalarSizeInBits() * 2; + }; + + // If the source operands were EXTENDED before, then {U/S}MULL can be used + bool IsZExt1 = + I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_ANYEXT; + bool IsZExt2 = + I2Opc == TargetOpcode::G_ZEXT || I2Opc == TargetOpcode::G_ANYEXT; + if (IsZExt1 && IsZExt2 && IsAtLeastDoubleExtend(I1->getOperand(1).getReg()) && + IsAtLeastDoubleExtend(I2->getOperand(1).getReg())) { + get<0>(MatchInfo) = true; + get<1>(MatchInfo) = I1->getOperand(1).getReg(); + get<2>(MatchInfo) = I2->getOperand(1).getReg(); + return true; + } + + bool IsSExt1 = + I1Opc == TargetOpcode::G_SEXT || I1Opc == TargetOpcode::G_ANYEXT; + bool IsSExt2 = + I2Opc == TargetOpcode::G_SEXT || I2Opc == TargetOpcode::G_ANYEXT; + if (IsSExt1 && IsSExt2 && IsAtLeastDoubleExtend(I1->getOperand(1).getReg()) && + IsAtLeastDoubleExtend(I2->getOperand(1).getReg())) { + get<0>(MatchInfo) = false; + get<1>(MatchInfo) = I1->getOperand(1).getReg(); + get<2>(MatchInfo) = 
I2->getOperand(1).getReg(); + return true; + } + + // Select UMULL if we can replace the other operand with an extend. + APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2); + if (KB && (IsZExt1 || IsZExt2) && + IsAtLeastDoubleExtend(IsZExt1 ? I1->getOperand(1).getReg() + : I2->getOperand(1).getReg())) { + Register ZExtOp = + IsZExt1 ? MI.getOperand(2).getReg() : MI.getOperand(1).getReg(); + if (KB->maskedValueIsZero(ZExtOp, Mask)) { + get<0>(MatchInfo) = true; + get<1>(MatchInfo) = IsZExt1 ? I1->getOperand(1).getReg() : ZExtOp; + get<2>(MatchInfo) = IsZExt1 ? ZExtOp : I2->getOperand(1).getReg(); + return true; + } + } else if (KB && DstTy == LLT::fixed_vector(2, 64) && + KB->maskedValueIsZero(MI.getOperand(1).getReg(), Mask) && + KB->maskedValueIsZero(MI.getOperand(2).getReg(), Mask)) { + get<0>(MatchInfo) = true; + get<1>(MatchInfo) = MI.getOperand(1).getReg(); + get<2>(MatchInfo) = MI.getOperand(2).getReg(); + return true; + } + + if (KB && (IsSExt1 || IsSExt2) && + IsAtLeastDoubleExtend(IsSExt1 ? I1->getOperand(1).getReg() + : I2->getOperand(1).getReg())) { + Register SExtOp = + IsSExt1 ? MI.getOperand(2).getReg() : MI.getOperand(1).getReg(); + if (KB->computeNumSignBits(SExtOp) > EltSize / 2) { + get<0>(MatchInfo) = false; + get<1>(MatchInfo) = IsSExt1 ? I1->getOperand(1).getReg() : SExtOp; + get<2>(MatchInfo) = IsSExt1 ? 
SExtOp : I2->getOperand(1).getReg(); + return true; + } + } else if (KB && DstTy == LLT::fixed_vector(2, 64) && + KB->computeNumSignBits(MI.getOperand(1).getReg()) > EltSize / 2 && + KB->computeNumSignBits(MI.getOperand(2).getReg()) > EltSize / 2) { + get<0>(MatchInfo) = false; + get<1>(MatchInfo) = MI.getOperand(1).getReg(); + get<2>(MatchInfo) = MI.getOperand(2).getReg(); + return true; + } + + return false; +} + +void applyExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, GISelChangeObserver &Observer, + std::tuple<bool, Register, Register> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_MUL && + "Expected a G_MUL instruction"); + + // Get the instructions that defined the source operand + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + bool IsZExt = get<0>(MatchInfo); + Register Src1Reg = get<1>(MatchInfo); + Register Src2Reg = get<2>(MatchInfo); + LLT Src1Ty = MRI.getType(Src1Reg); + LLT Src2Ty = MRI.getType(Src2Reg); + LLT HalfDstTy = DstTy.changeElementSize(DstTy.getScalarSizeInBits() / 2); + unsigned ExtOpc = IsZExt ? TargetOpcode::G_ZEXT : TargetOpcode::G_SEXT; + + if (Src1Ty.getScalarSizeInBits() * 2 != DstTy.getScalarSizeInBits()) + Src1Reg = B.buildExtOrTrunc(ExtOpc, {HalfDstTy}, {Src1Reg}).getReg(0); + if (Src2Ty.getScalarSizeInBits() * 2 != DstTy.getScalarSizeInBits()) + Src2Reg = B.buildExtOrTrunc(ExtOpc, {HalfDstTy}, {Src2Reg}).getReg(0); + + B.buildInstr(IsZExt ? 
AArch64::G_UMULL : AArch64::G_SMULL, + {MI.getOperand(0).getReg()}, {Src1Reg, Src2Reg}); + MI.eraseFromParent(); +} + class AArch64PostLegalizerCombinerImpl : public Combiner { protected: const CombinerHelper Helper; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index dea08d98f524f..4785c7b68d94d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -1190,68 +1190,24 @@ void applyUnmergeExtToUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, // Doing these two matches in one function to ensure that the order of matching // will always be the same. // Try lowering MUL to MULL before trying to scalarize if needed. -bool matchExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI) { +bool matchMulv2s64(MachineInstr &MI, MachineRegisterInfo &MRI) { // Get the instructions that defined the source operand LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); - MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); - - if (DstTy.isVector()) { - // If the source operands were EXTENDED before, then {U/S}MULL can be used - unsigned I1Opc = I1->getOpcode(); - unsigned I2Opc = I2->getOpcode(); - if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) || - (I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) && - (MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() == - MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) && - (MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() == - MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) { - return true; - } - // If result type is v2s64, scalarise the instruction - else if (DstTy == LLT::fixed_vector(2, 64)) { - return true; - } - } - return false; + return DstTy == 
LLT::fixed_vector(2, 64); } -void applyExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, GISelChangeObserver &Observer) { +void applyMulv2s64(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, GISelChangeObserver &Observer) { assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL instruction"); // Get the instructions that defined the source operand LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); - MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); - - // If the source operands were EXTENDED before, then {U/S}MULL can be used - unsigned I1Opc = I1->getOpcode(); - unsigned I2Opc = I2->getOpcode(); - if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) || - (I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) && - (MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() == - MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) && - (MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() == - MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) { - - B.setInstrAndDebugLoc(MI); - B.buildInstr(I1->getOpcode() == TargetOpcode::G_ZEXT ? 
AArch64::G_UMULL - : AArch64::G_SMULL, - {MI.getOperand(0).getReg()}, - {I1->getOperand(1).getReg(), I2->getOperand(1).getReg()}); - MI.eraseFromParent(); - } - // If result type is v2s64, scalarise the instruction - else if (DstTy == LLT::fixed_vector(2, 64)) { - LegalizerHelper Helper(*MI.getMF(), Observer, B); - B.setInstrAndDebugLoc(MI); - Helper.fewerElementsVector( - MI, 0, - DstTy.changeElementCount( - DstTy.getElementCount().divideCoefficientBy(2))); - } + assert(DstTy == LLT::fixed_vector(2, 64) && "Expected v2s64 Mul"); + LegalizerHelper Helper(*MI.getMF(), Observer, B); + Helper.fewerElementsVector( + MI, 0, + DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2))); } class AArch64PostLegalizerLoweringImpl : public Combiner { diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 3b589d3480179..714be46a015f4 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -73,14 +73,10 @@ define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind { ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr d0, [x0] -; CHECK-GI-NEXT: ldr q1, [x1] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: ushll v1.8h, v0.8b, #0 +; CHECK-GI-NEXT: smull v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: smull2 v1.4s, v1.8h, v2.8h ; CHECK-GI-NEXT: ret %load.A = load <8 x i8>, ptr %A %load.B = load <8 x i16>, ptr %B @@ -112,14 +108,10 @@ define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounw ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr d0, [x1] -; CHECK-GI-NEXT: ldr q1, 
[x0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: mul v0.4s, v2.4s, v3.4s -; CHECK-GI-NEXT: mul v1.4s, v1.4s, v4.4s +; CHECK-GI-NEXT: ldr q2, [x0] +; CHECK-GI-NEXT: ushll v1.8h, v0.8b, #0 +; CHECK-GI-NEXT: smull v0.4s, v2.4h, v1.4h +; CHECK-GI-NEXT: smull2 v1.4s, v2.8h, v1.8h ; CHECK-GI-NEXT: ret %load.A = load <8 x i16>, ptr %A %load.B = load <8 x i8>, ptr %B @@ -258,20 +250,10 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b -; CHECK-GI-NEXT: ldr d1, [x1] -; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-GI-NEXT: mov w8, v0.s[0] -; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: ldr d0, [x1] +; CHECK-GI-NEXT: smull v0.2d, v1.2s, v0.2s ; CHECK-GI-NEXT: ret %load.A = load <2 x i16>, ptr %A %load.B = load <2 x i32>, ptr %B @@ -304,16 +286,7 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: ldr d1, [x1] -; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov 
v0.d[1], x9 +; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret %load.A = load <2 x i32>, ptr %A %and.A = and <2 x i32> %load.A, @@ -935,24 +908,11 @@ define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { ; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements. define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { -; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16: -; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: movi v1.8b, #244 -; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16: -; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: movi v1.8b, #244 -; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mvni v1.8h, #11 -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: smull_extvec_v8i8_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.8b, #244 +; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ret %tmp3 = sext <8 x i8> %arg to <8 x i16> %tmp4 = mul <8 x i16> %tmp3, ret <8 x i16> %tmp4 @@ -989,24 +949,11 @@ define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind { } define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { -; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32: -; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: mvni v1.4h, #11 -; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32: -; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: mvni v1.4h, #11 -; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mvni v1.4s, #11 -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: smull_extvec_v4i16_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mvni v1.4h, #11 +; 
CHECK-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ret %tmp3 = sext <4 x i16> %arg to <4 x i32> %tmp4 = mul <4 x i32> %tmp3, ret <4 x i32> %tmp4 @@ -1030,16 +977,8 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: adrp x8, .LCPI36_0 -; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI36_0] +; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret %tmp3 = sext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1047,24 +986,11 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { } define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { -; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16: -; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: movi v1.8b, #12 -; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16: -; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: movi v1.8b, #12 -; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.8h, #12 -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: umull_extvec_v8i8_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.8b, #12 +; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ret %tmp3 = zext <8 x i8> %arg to <8 x i16> %tmp4 = mul <8 x i16> %tmp3, ret <8 x i16> %tmp4 @@ -1118,9 +1044,8 @@ define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { ; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32: ; CHECK-GI: // 
%bb.0: ; CHECK-GI-NEXT: adrp x8, .LCPI39_0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0] -; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI39_0] +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret %tmp3 = zext <4 x i16> %arg to <4 x i32> %tmp4 = mul <4 x i32> %tmp3, @@ -1145,16 +1070,8 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: adrp x8, .LCPI40_0 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI40_0] +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1178,10 +1095,9 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { ; ; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.8h, #12 -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: movi v1.8b, #12 ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: ret %tmp3 = zext <8 x i8> %arg to <8 x i16> @@ -1212,10 +1128,9 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { ; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: adrp x8, .LCPI42_0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_0] -; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ldr d1, [x8, 
:lo12:.LCPI42_0] +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: ret %tmp3 = zext <4 x i16> %arg to <4 x i32> @@ -1246,18 +1161,10 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: adrp x8, .LCPI43_0 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI43_0] +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1635,9 +1542,9 @@ define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) { ; CHECK-GI-LABEL: umull_and_v8i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> @@ -1664,9 +1571,9 @@ define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) { ; CHECK-GI-LABEL: umull_and_v8i16_c: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: umull v0.8h, v1.8b, v0.8b ; CHECK-GI-NEXT: 
ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> @@ -1705,9 +1612,8 @@ define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) { ; ; CHECK-GI-LABEL: umull_andconst_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: movi v1.2d, #0xff00ff00ff00ff -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: movi d1, #0xffffffffffffffff +; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> @@ -1751,29 +1657,13 @@ entry: } define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) { -; CHECK-NEON-LABEL: umull_and_v4i32: -; CHECK-NEON: // %bb.0: // %entry -; CHECK-NEON-NEXT: movi v2.2d, #0x0000ff000000ff -; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEON-NEXT: xtn v1.4h, v1.4s -; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: umull_and_v4i32: -; CHECK-SVE: // %bb.0: // %entry -; CHECK-SVE-NEXT: movi v2.2d, #0x0000ff000000ff -; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-SVE-NEXT: xtn v1.4h, v1.4s -; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: umull_and_v4i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: umull_and_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ret entry: %in1 = zext <4 x i16> %src1 to <4 x i32> %in2 = and <4 x i32> %src2, @@ -1805,12 +1695,13 @@ define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) { ; CHECK-GI-LABEL: umull_and_v8i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff -; CHECK-GI-NEXT: ushll v4.4s, v0.4h, 
#0 -; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 -; CHECK-GI-NEXT: and v0.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: mul v0.4s, v4.4s, v0.4s -; CHECK-GI-NEXT: mul v1.4s, v5.4s, v1.4s +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: xtn v2.4h, v2.4s +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v1.4s, v3.4h, v2.4h ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i16> %src1 to <8 x i32> @@ -1855,36 +1746,13 @@ entry: } define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) { -; CHECK-NEON-LABEL: umull_and_v2i64: -; CHECK-NEON: // %bb.0: // %entry -; CHECK-NEON-NEXT: movi v2.2d, #0x000000000000ff -; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEON-NEXT: xtn v1.2s, v1.2d -; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: umull_and_v2i64: -; CHECK-SVE: // %bb.0: // %entry -; CHECK-SVE-NEXT: movi v2.2d, #0x000000000000ff -; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-SVE-NEXT: xtn v1.2s, v1.2d -; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: umull_and_v2i64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: umull_and_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000000000ff +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: xtn v1.2s, v1.2d +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret entry: %in1 = zext <2 x i32> %src1 to <2 
x i64> %in2 = and <2 x i64> %src2, @@ -1916,26 +1784,13 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) { ; CHECK-GI-LABEL: umull_and_v4i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v3.2d, #0x000000000000ff -; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-GI-NEXT: fmov x8, d4 -; CHECK-GI-NEXT: mov x10, v4.d[1] -; CHECK-GI-NEXT: mov x13, v0.d[1] ; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x12, d2 -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: mov x14, v2.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d0 -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: mul x9, x9, x12 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mul x11, x13, x14 -; CHECK-GI-NEXT: mov v1.d[0], x9 -; CHECK-GI-NEXT: mov v0.d[1], x10 -; CHECK-GI-NEXT: mov v1.d[1], x11 +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: xtn v1.2s, v1.2d +; CHECK-GI-NEXT: xtn v2.2s, v2.2d +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: umull v1.2d, v3.2s, v2.2s ; CHECK-GI-NEXT: ret entry: %in1 = zext <4 x i32> %src1 to <4 x i64> @@ -2397,33 +2252,12 @@ define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) { } define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) { -; CHECK-NEON-LABEL: lsr: -; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-NEON-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: lsr: -; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-SVE-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: lsr: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #32 -; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #32 -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; 
CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: lsr: +; CHECK: // %bb.0: +; CHECK-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %x = lshr <2 x i64> %a, %y = lshr <2 x i64> %b, %z = mul nsw <2 x i64> %x, %y @@ -2431,34 +2265,12 @@ define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) { } define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) { -; CHECK-NEON-LABEL: lsr_const: -; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: movi v1.2s, #31 -; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-NEON-NEXT: ret -; -; CHECK-SVE-LABEL: lsr_const: -; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: movi v1.2s, #31 -; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 -; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-SVE-NEXT: ret -; -; CHECK-GI-LABEL: lsr_const: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI79_0 -; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #32 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI79_0] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mov x11, v1.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: lsr_const: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2s, #31 +; CHECK-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %x = lshr <2 x i64> %a, %z = mul nsw <2 x i64> %x, ret <2 x i64> %z @@ -2629,10 +2441,10 @@ define <8 x i16> @smulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind ; ; CHECK-GI-LABEL: smulladdl_const_v8i8_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.8h, #10 -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h -; 
CHECK-GI-NEXT: saddw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: movi v2.8b, #10 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: smlal v1.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret %tmp1 = sext <8 x i8> %A to <8 x i16> %tmp3 = sext <8 x i8> %C to <8 x i16> @@ -2658,10 +2470,10 @@ define <8 x i16> @umulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind ; ; CHECK-GI-LABEL: umulladdl_const_v8i8_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.8h, #10 -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h -; CHECK-GI-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: movi v2.8b, #10 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: umlal v1.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret %tmp1 = zext <8 x i8> %A to <8 x i16> %tmp3 = zext <8 x i8> %C to <8 x i16> @@ -2942,18 +2754,10 @@ define <2 x i64> @smulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwi ; ; CHECK-GI-LABEL: smulladdl_const_v2i32_v2i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI98_0 -; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI98_0] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mov x11, v2.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: saddw v0.2d, v0.2d, v1.2s +; CHECK-GI-NEXT: movi v2.2s, #10 +; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 +; CHECK-GI-NEXT: smlal v1.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret %tmp1 = sext <2 x i32> %A to <2 x i64> %tmp3 = sext <2 x i32> %C to <2 x i64> @@ -2979,18 +2783,10 @@ define <2 x i64> @umulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwi ; ; CHECK-GI-LABEL: umulladdl_const_v2i32_v2i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI99_0 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, 
#0 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI99_0] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: mov x11, v2.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-GI-NEXT: movi v2.2s, #10 +; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-GI-NEXT: umlal v1.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret %tmp1 = zext <2 x i32> %A to <2 x i64> %tmp3 = zext <2 x i32> %C to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 6fb4e219d39f4..2d3fda704908e 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -812,12 +812,8 @@ define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s -; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: smull v2.4s, v1.4h, v0.4h +; CHECK-GI-NEXT: smlal2 v2.4s, v1.8h, v0.8h ; CHECK-GI-NEXT: addv s0, v2.4s ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret @@ -848,12 +844,8 @@ define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr noca ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s -; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: smull v2.4s, v1.4h, v0.4h 
+; CHECK-GI-NEXT: smlal2 v2.4s, v1.8h, v0.8h ; CHECK-GI-NEXT: addv s0, v2.4s ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret @@ -965,18 +957,10 @@ define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly % ; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 ; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 ; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-GI-NEXT: ushll2 v4.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v6.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll2 v7.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s -; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s -; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s -; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: smull v4.4s, v3.4h, v2.4h +; CHECK-GI-NEXT: smull v5.4s, v1.4h, v0.4h +; CHECK-GI-NEXT: smlal2 v4.4s, v3.8h, v2.8h +; CHECK-GI-NEXT: smlal2 v5.4s, v1.8h, v0.8h ; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w8, s0 @@ -1013,18 +997,10 @@ define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture readonly %a, ptr noc ; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 ; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 ; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v6.4s, v3.8h, #0 -; CHECK-GI-NEXT: ushll2 v7.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s -; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s -; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s -; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: smull v4.4s, v3.4h, v2.4h +; CHECK-GI-NEXT: smull v5.4s, v1.4h, v0.4h +; CHECK-GI-NEXT: smlal2 v4.4s, v3.8h, v2.8h +; 
CHECK-GI-NEXT: smlal2 v5.4s, v1.8h, v0.8h ; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w8, s0 @@ -1332,18 +1308,10 @@ define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll2 v6.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: smull v4.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: smull v5.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: smlal2 v4.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smlal2 v5.4s, v2.8h, v3.8h ; CHECK-GI-NEXT: addv s0, v4.4s ; CHECK-GI-NEXT: addv s1, v5.4s ; CHECK-GI-NEXT: fmov w8, s0 @@ -1381,18 +1349,10 @@ define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: smull v4.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: smull v5.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: smlal2 v4.4s, v0.8h, v1.8h 
+; CHECK-GI-NEXT: smlal2 v5.4s, v2.8h, v3.8h ; CHECK-GI-NEXT: addv s0, v4.4s ; CHECK-GI-NEXT: addv s1, v5.4s ; CHECK-GI-NEXT: fmov w8, s0 @@ -1431,33 +1391,17 @@ define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1 ; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0 ; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 ; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0 -; CHECK-GI-NEXT: sshll v7.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-GI-NEXT: sshll v7.8h, v3.8b, #0 ; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-GI-NEXT: ushll2 v16.4s, v4.8h, #0 -; CHECK-GI-NEXT: ushll2 v17.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0 -; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0 -; CHECK-GI-NEXT: sshll2 v21.4s, v7.8h, #0 -; CHECK-GI-NEXT: ushll2 v22.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll2 v23.4s, v3.8h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 -; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s -; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s -; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s -; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: smull v16.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: smull v17.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: smull v18.4s, v6.4h, v7.4h +; CHECK-GI-NEXT: smull v19.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: smlal2 v16.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: smlal2 v17.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smlal2 v18.4s, v6.8h, v7.8h +; CHECK-GI-NEXT: smlal2 v19.4s, v2.8h, v3.8h ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s ; CHECK-GI-NEXT: add v1.4s, v18.4s, 
v19.4s ; CHECK-GI-NEXT: addv s0, v0.4s @@ -1499,33 +1443,17 @@ define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b, ; CHECK-GI-NEXT: ushll v5.8h, v1.8b, #0 ; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 ; CHECK-GI-NEXT: sshll v6.8h, v2.8b, #0 -; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0 ; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0 -; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v18.4s, v5.8h, #0 -; CHECK-GI-NEXT: ushll2 v19.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll2 v20.4s, v6.8h, #0 -; CHECK-GI-NEXT: ushll2 v21.4s, v7.8h, #0 -; CHECK-GI-NEXT: sshll2 v22.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 -; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s -; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s -; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: smull v16.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: smull v17.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: smull v18.4s, v6.4h, v7.4h +; CHECK-GI-NEXT: smull v19.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: smlal2 v16.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: smlal2 v17.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smlal2 v18.4s, v6.8h, v7.8h +; CHECK-GI-NEXT: smlal2 v19.4s, v2.8h, v3.8h ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s ; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s ; CHECK-GI-NEXT: addv s0, v0.4s @@ -3858,30 +3786,14 @@ define i32 @test_usdot_v32i8(ptr 
nocapture readonly %a, ptr nocapture readonly % ; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0 -; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0 -; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0 -; CHECK-GI-NEXT: ushll2 v21.4s, v2.8h, #0 -; CHECK-GI-NEXT: ushll2 v22.4s, v7.8h, #0 -; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 -; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mul v16.4s, v16.4s, v20.4s -; CHECK-GI-NEXT: mul v17.4s, v17.4s, v21.4s -; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-GI-NEXT: mul v18.4s, v18.4s, v22.4s -; CHECK-GI-NEXT: mul v19.4s, v19.4s, v23.4s -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mla v16.4s, v4.4s, v6.4s -; CHECK-GI-NEXT: mla v17.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: mla v18.4s, v5.4s, v7.4s -; CHECK-GI-NEXT: mla v19.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: smull v16.4s, v4.4h, v6.4h +; CHECK-GI-NEXT: smull v17.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: smull v18.4s, v5.4h, v7.4h +; CHECK-GI-NEXT: smull v19.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: smlal2 v16.4s, v4.8h, v6.8h +; CHECK-GI-NEXT: smlal2 v17.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: smlal2 v18.4s, v5.8h, v7.8h +; CHECK-GI-NEXT: smlal2 v19.4s, v1.8h, v3.8h ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s ; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s @@ -3920,19 +3832,6 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 ; ; CHECK-GI-LABEL: test_usdot_v32i8_double: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 -; CHECK-GI-NEXT: .cfi_offset b8, -8 -; CHECK-GI-NEXT: .cfi_offset b9, -16 -; CHECK-GI-NEXT: .cfi_offset b10, -24 -; CHECK-GI-NEXT: .cfi_offset b11, -32 -; CHECK-GI-NEXT: .cfi_offset b12, -40 -; CHECK-GI-NEXT: .cfi_offset b13, -48 -; CHECK-GI-NEXT: .cfi_offset b14, -56 -; CHECK-GI-NEXT: .cfi_offset b15, -64 ; CHECK-GI-NEXT: ushll v16.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 ; CHECK-GI-NEXT: ushll v17.8h, v1.8b, #0 @@ -3941,69 +3840,34 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 ; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-GI-NEXT: sshll v19.8h, v3.8b, #0 ; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-GI-NEXT: ushll v27.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll v20.8h, v4.8b, #0 ; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0 -; CHECK-GI-NEXT: ushll v28.8h, v5.8b, #0 -; CHECK-GI-NEXT: sshll v29.8h, v6.8b, #0 -; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0 +; CHECK-GI-NEXT: ushll v21.8h, v5.8b, #0 ; CHECK-GI-NEXT: ushll2 v5.8h, v5.16b, #0 -; CHECK-GI-NEXT: sshll v30.8h, v7.8b, #0 +; CHECK-GI-NEXT: sshll v22.8h, v6.8b, #0 +; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0 +; CHECK-GI-NEXT: sshll v23.8h, v7.8b, #0 ; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0 -; CHECK-GI-NEXT: ushll2 v20.4s, v16.8h, #0 -; CHECK-GI-NEXT: ushll2 v21.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v22.4s, v17.8h, #0 -; CHECK-GI-NEXT: ushll2 v23.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll2 v24.4s, v18.8h, #0 -; CHECK-GI-NEXT: sshll2 v25.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll2 v26.4s, v19.8h, #0 -; CHECK-GI-NEXT: sshll2 v31.4s, v3.8h, #0 -; CHECK-GI-NEXT: ushll2 v8.4s, v27.8h, #0 -; CHECK-GI-NEXT: ushll2 v9.4s, v4.8h, #0 -; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0 -; CHECK-GI-NEXT: sshll2 v11.4s, v29.8h, #0 -; 
CHECK-GI-NEXT: sshll2 v12.4s, v6.8h, #0 -; CHECK-GI-NEXT: ushll2 v13.4s, v5.8h, #0 -; CHECK-GI-NEXT: sshll2 v14.4s, v30.8h, #0 -; CHECK-GI-NEXT: sshll2 v15.4s, v7.8h, #0 -; CHECK-GI-NEXT: mul v20.4s, v20.4s, v24.4s -; CHECK-GI-NEXT: mul v21.4s, v21.4s, v25.4s -; CHECK-GI-NEXT: mul v22.4s, v22.4s, v26.4s -; CHECK-GI-NEXT: mul v23.4s, v23.4s, v31.4s -; CHECK-GI-NEXT: mul v24.4s, v8.4s, v11.4s -; CHECK-GI-NEXT: mul v25.4s, v9.4s, v12.4s -; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mul v26.4s, v10.4s, v14.4s -; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: mul v31.4s, v13.4s, v15.4s -; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll v18.4s, v18.4h, #0 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0 -; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 -; CHECK-GI-NEXT: sshll v29.4s, v29.4h, #0 -; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 -; CHECK-GI-NEXT: sshll v30.4s, v30.4h, #0 -; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0 -; CHECK-GI-NEXT: mla v20.4s, v16.4s, v18.4s -; CHECK-GI-NEXT: mla v21.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: mla v22.4s, v17.4s, v19.4s -; CHECK-GI-NEXT: mla v23.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: mla v24.4s, v27.4s, v29.4s -; CHECK-GI-NEXT: mla v25.4s, v4.4s, v6.4s -; CHECK-GI-NEXT: mla v26.4s, v28.4s, v30.4s -; CHECK-GI-NEXT: mla v31.4s, v5.4s, v7.4s -; CHECK-GI-NEXT: add v0.4s, v20.4s, v21.4s -; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s -; CHECK-GI-NEXT: add v2.4s, v24.4s, v25.4s -; CHECK-GI-NEXT: add v3.4s, v26.4s, v31.4s +; CHECK-GI-NEXT: smull v24.4s, v16.4h, v18.4h +; CHECK-GI-NEXT: smull 
v25.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: smull v26.4s, v17.4h, v19.4h +; CHECK-GI-NEXT: smull v27.4s, v1.4h, v3.4h +; CHECK-GI-NEXT: smull v28.4s, v20.4h, v22.4h +; CHECK-GI-NEXT: smull v29.4s, v4.4h, v6.4h +; CHECK-GI-NEXT: smull v30.4s, v21.4h, v23.4h +; CHECK-GI-NEXT: smull v31.4s, v5.4h, v7.4h +; CHECK-GI-NEXT: smlal2 v24.4s, v16.8h, v18.8h +; CHECK-GI-NEXT: smlal2 v25.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: smlal2 v26.4s, v17.8h, v19.8h +; CHECK-GI-NEXT: smlal2 v27.4s, v1.8h, v3.8h +; CHECK-GI-NEXT: smlal2 v28.4s, v20.8h, v22.8h +; CHECK-GI-NEXT: smlal2 v29.4s, v4.8h, v6.8h +; CHECK-GI-NEXT: smlal2 v30.4s, v21.8h, v23.8h +; CHECK-GI-NEXT: smlal2 v31.4s, v5.8h, v7.8h +; CHECK-GI-NEXT: add v0.4s, v24.4s, v25.4s +; CHECK-GI-NEXT: add v1.4s, v26.4s, v27.4s +; CHECK-GI-NEXT: add v2.4s, v28.4s, v29.4s +; CHECK-GI-NEXT: add v3.4s, v30.4s, v31.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: addv s0, v0.4s @@ -4011,7 +3875,6 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w0, w8, w9 -; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %az = zext <32 x i8> %a to <32 x i32> @@ -7415,101 +7278,52 @@ define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly % ; ; CHECK-GI-LABEL: test_usdot_v64i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 -; CHECK-GI-NEXT: .cfi_offset b8, -8 -; CHECK-GI-NEXT: .cfi_offset b9, -16 -; CHECK-GI-NEXT: .cfi_offset b10, -24 -; CHECK-GI-NEXT: .cfi_offset b11, -32 -; CHECK-GI-NEXT: .cfi_offset b12, -40 -; CHECK-GI-NEXT: .cfi_offset b13, -48 -; CHECK-GI-NEXT: .cfi_offset b14, -56 -; CHECK-GI-NEXT: .cfi_offset b15, -64 -; CHECK-GI-NEXT: ldp q0, q1, [x1] -; CHECK-GI-NEXT: ldp q21, q17, [x0] -; CHECK-GI-NEXT: ldp q3, q19, [x1, #32] -; CHECK-GI-NEXT: ldp q18, q4, [x0, #32] -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v5.8h, v0.16b, #0 -; CHECK-GI-NEXT: sshll v7.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll2 v22.8h, v1.16b, #0 -; CHECK-GI-NEXT: sshll v23.8h, v3.8b, #0 -; CHECK-GI-NEXT: sshll2 v24.8h, v3.16b, #0 -; CHECK-GI-NEXT: sshll v25.8h, v19.8b, #0 -; CHECK-GI-NEXT: sshll2 v26.8h, v19.16b, #0 -; CHECK-GI-NEXT: ushll v27.8h, v21.8b, #0 -; CHECK-GI-NEXT: ushll2 v28.8h, v21.16b, #0 -; CHECK-GI-NEXT: ushll v30.8h, v17.8b, #0 -; CHECK-GI-NEXT: ushll2 v17.8h, v17.16b, #0 -; CHECK-GI-NEXT: ushll v8.8h, v18.8b, #0 -; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0 -; CHECK-GI-NEXT: ushll v9.8h, v4.8b, #0 +; CHECK-GI-NEXT: ldp q0, q1, [x0] +; CHECK-GI-NEXT: ldp q2, q5, [x1] +; CHECK-GI-NEXT: ldp q3, q4, [x0, #32] +; CHECK-GI-NEXT: ldp q6, q7, [x1, #32] +; CHECK-GI-NEXT: ushll v20.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v16.8h, v2.8b, #0 +; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-GI-NEXT: sshll v17.8h, v5.8b, #0 +; CHECK-GI-NEXT: sshll2 v5.8h, v5.16b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll v21.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll v18.8h, v6.8b, #0 +; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0 +; CHECK-GI-NEXT: sshll v19.8h, v7.8b, #0 +; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0 +; 
CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v22.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-GI-NEXT: ushll v23.8h, v4.8b, #0 ; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0 -; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v1.4s, v5.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v5.8h, #0 -; CHECK-GI-NEXT: sshll v2.4s, v7.4h, #0 -; CHECK-GI-NEXT: sshll2 v20.4s, v7.8h, #0 -; CHECK-GI-NEXT: sshll v3.4s, v22.4h, #0 -; CHECK-GI-NEXT: sshll2 v22.4s, v22.8h, #0 -; CHECK-GI-NEXT: sshll v5.4s, v23.4h, #0 -; CHECK-GI-NEXT: sshll2 v23.4s, v23.8h, #0 -; CHECK-GI-NEXT: sshll v7.4s, v24.4h, #0 -; CHECK-GI-NEXT: sshll2 v24.4s, v24.8h, #0 -; CHECK-GI-NEXT: sshll v19.4s, v25.4h, #0 -; CHECK-GI-NEXT: sshll2 v25.4s, v25.8h, #0 -; CHECK-GI-NEXT: sshll v21.4s, v26.4h, #0 -; CHECK-GI-NEXT: sshll2 v26.4s, v26.8h, #0 -; CHECK-GI-NEXT: ushll v29.4s, v27.4h, #0 -; CHECK-GI-NEXT: ushll2 v27.4s, v27.8h, #0 -; CHECK-GI-NEXT: ushll v31.4s, v28.4h, #0 -; CHECK-GI-NEXT: ushll2 v28.4s, v28.8h, #0 -; CHECK-GI-NEXT: ushll v10.4s, v30.4h, #0 -; CHECK-GI-NEXT: ushll2 v30.4s, v30.8h, #0 -; CHECK-GI-NEXT: ushll v11.4s, v17.4h, #0 -; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0 -; CHECK-GI-NEXT: ushll2 v12.4s, v8.8h, #0 -; CHECK-GI-NEXT: ushll2 v13.4s, v18.8h, #0 -; CHECK-GI-NEXT: ushll2 v14.4s, v9.8h, #0 -; CHECK-GI-NEXT: ushll2 v15.4s, v4.8h, #0 -; CHECK-GI-NEXT: mul v6.4s, v6.4s, v27.4s -; CHECK-GI-NEXT: mul v16.4s, v16.4s, v28.4s -; CHECK-GI-NEXT: mul v20.4s, v20.4s, v30.4s -; CHECK-GI-NEXT: mul v17.4s, v22.4s, v17.4s -; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0 -; CHECK-GI-NEXT: mul v22.4s, v23.4s, v12.4s -; CHECK-GI-NEXT: mul v23.4s, v24.4s, v13.4s -; CHECK-GI-NEXT: mul v24.4s, v25.4s, v14.4s -; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mul v25.4s, v26.4s, v15.4s -; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 -; CHECK-GI-NEXT: ushll v26.4s, v9.4h, #0 -; CHECK-GI-NEXT: ushll v4.4s, 
v4.4h, #0 -; CHECK-GI-NEXT: mla v6.4s, v0.4s, v29.4s -; CHECK-GI-NEXT: mla v16.4s, v1.4s, v31.4s -; CHECK-GI-NEXT: mla v20.4s, v2.4s, v10.4s -; CHECK-GI-NEXT: mla v17.4s, v3.4s, v11.4s -; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: mla v22.4s, v5.4s, v8.4s -; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mla v23.4s, v7.4s, v18.4s -; CHECK-GI-NEXT: mla v24.4s, v19.4s, v26.4s -; CHECK-GI-NEXT: mla v25.4s, v21.4s, v4.4s -; CHECK-GI-NEXT: add v0.4s, v6.4s, v16.4s -; CHECK-GI-NEXT: add v1.4s, v20.4s, v17.4s -; CHECK-GI-NEXT: add v2.4s, v22.4s, v23.4s -; CHECK-GI-NEXT: add v3.4s, v24.4s, v25.4s +; CHECK-GI-NEXT: smull v24.4s, v16.4h, v20.4h +; CHECK-GI-NEXT: smull v25.4s, v2.4h, v0.4h +; CHECK-GI-NEXT: smull v26.4s, v17.4h, v21.4h +; CHECK-GI-NEXT: smull v27.4s, v5.4h, v1.4h +; CHECK-GI-NEXT: smull v28.4s, v18.4h, v22.4h +; CHECK-GI-NEXT: smull v29.4s, v6.4h, v3.4h +; CHECK-GI-NEXT: smull v30.4s, v19.4h, v23.4h +; CHECK-GI-NEXT: smull v31.4s, v7.4h, v4.4h +; CHECK-GI-NEXT: smlal2 v24.4s, v16.8h, v20.8h +; CHECK-GI-NEXT: smlal2 v25.4s, v2.8h, v0.8h +; CHECK-GI-NEXT: smlal2 v26.4s, v17.8h, v21.8h +; CHECK-GI-NEXT: smlal2 v27.4s, v5.8h, v1.8h +; CHECK-GI-NEXT: smlal2 v28.4s, v18.8h, v22.8h +; CHECK-GI-NEXT: smlal2 v29.4s, v6.8h, v3.8h +; CHECK-GI-NEXT: smlal2 v30.4s, v19.8h, v23.8h +; CHECK-GI-NEXT: smlal2 v31.4s, v7.8h, v4.8h +; CHECK-GI-NEXT: add v0.4s, v24.4s, v25.4s +; CHECK-GI-NEXT: add v1.4s, v26.4s, v27.4s +; CHECK-GI-NEXT: add v2.4s, v28.4s, v29.4s +; CHECK-GI-NEXT: add v3.4s, v30.4s, v31.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: add w0, w8, w2 -; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %0 = load <64 x i8>, ptr %a @@ -7558,13 +7372,13 @@ define i32 @test_usdot_v64i8_double(<64 x i8> 
%a, <64 x i8> %b, <64 x i8> %c, <6 ; ; CHECK-GI-LABEL: test_usdot_v64i8_double: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sub sp, sp, #304 -; CHECK-GI-NEXT: stp d15, d14, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d13, d12, [sp, #240] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d11, d10, [sp, #256] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #272] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x29, [sp, #288] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 304 +; CHECK-GI-NEXT: sub sp, sp, #240 +; CHECK-GI-NEXT: stp d15, d14, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x29, [sp, #224] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 240 ; CHECK-GI-NEXT: .cfi_offset w29, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 @@ -7574,190 +7388,114 @@ define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <6 ; CHECK-GI-NEXT: .cfi_offset b13, -64 ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 -; CHECK-GI-NEXT: ushll v17.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-GI-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v20.16b, v3.16b -; CHECK-GI-NEXT: ushll v16.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll2 v18.8h, v1.16b, #0 -; CHECK-GI-NEXT: ushll v26.8h, v2.8b, #0 -; CHECK-GI-NEXT: ldp q27, q28, [sp, #304] -; CHECK-GI-NEXT: ushll2 v29.8h, v2.16b, #0 -; CHECK-GI-NEXT: ushll v2.4s, v17.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll v8.8h, v4.8b, #0 -; CHECK-GI-NEXT: ldp q23, q21, [sp, #368] -; CHECK-GI-NEXT: sshll2 v9.8h, v4.16b, #0 -; CHECK-GI-NEXT: sshll2 v11.8h, v5.16b, #0 -; CHECK-GI-NEXT: mov v25.16b, v7.16b -; CHECK-GI-NEXT: ushll2 v19.4s, v17.8h, #0 -; 
CHECK-GI-NEXT: stp q1, q2, [sp, #192] // 32-byte Folded Spill -; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v17.4s, v18.8h, #0 -; CHECK-GI-NEXT: ldp q24, q22, [sp, #336] -; CHECK-GI-NEXT: sshll v10.8h, v5.8b, #0 -; CHECK-GI-NEXT: sshll v12.8h, v6.8b, #0 -; CHECK-GI-NEXT: sshll2 v13.8h, v6.16b, #0 -; CHECK-GI-NEXT: mov v2.16b, v20.16b -; CHECK-GI-NEXT: sshll2 v0.4s, v8.8h, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v9.8h, #0 -; CHECK-GI-NEXT: sshll2 v6.4s, v11.8h, #0 -; CHECK-GI-NEXT: ushll2 v7.4s, v16.8h, #0 -; CHECK-GI-NEXT: ushll2 v31.4s, v29.8h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v10.8h, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v13.8h, #0 -; CHECK-GI-NEXT: ushll2 v30.4s, v26.8h, #0 -; CHECK-GI-NEXT: ushll v14.8h, v2.8b, #0 -; CHECK-GI-NEXT: mul v20.4s, v19.4s, v0.4s -; CHECK-GI-NEXT: mul v19.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: sshll v0.8h, v25.8b, #0 -; CHECK-GI-NEXT: mul v4.4s, v17.4s, v6.4s -; CHECK-GI-NEXT: sshll2 v15.4s, v12.8h, #0 -; CHECK-GI-NEXT: ldp q17, q3, [sp, #400] -; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s -; CHECK-GI-NEXT: mul v7.4s, v31.4s, v1.4s -; CHECK-GI-NEXT: ushll2 v31.8h, v2.16b, #0 -; CHECK-GI-NEXT: sshll2 v25.8h, v25.16b, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v14.4h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: str q3, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: ushll2 v3.4s, v14.8h, #0 -; CHECK-GI-NEXT: mul v6.4s, v30.4s, v15.4s -; CHECK-GI-NEXT: str q31, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: ushll v30.4s, v26.4h, #0 -; CHECK-GI-NEXT: sshll v26.4s, v8.4h, #0 -; CHECK-GI-NEXT: ushll v14.8h, v27.8b, #0 -; CHECK-GI-NEXT: ushll v15.4s, v29.4h, #0 -; CHECK-GI-NEXT: sshll v29.4s, v9.4h, #0 -; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s -; CHECK-GI-NEXT: ushll2 v3.4s, v31.8h, #0 -; CHECK-GI-NEXT: ushll v31.8h, v28.8b, #0 -; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: sshll v8.4s, v10.4h, #0 -; CHECK-GI-NEXT: sshll v9.4s, v11.4h, #0 -; CHECK-GI-NEXT: 
sshll v10.4s, v12.4h, #0 -; CHECK-GI-NEXT: sshll v11.4s, v13.4h, #0 -; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 -; CHECK-GI-NEXT: stp q3, q25, [sp, #112] // 32-byte Folded Spill -; CHECK-GI-NEXT: ldr q3, [sp, #208] // 16-byte Folded Reload -; CHECK-GI-NEXT: ushll2 v28.8h, v28.16b, #0 -; CHECK-GI-NEXT: mla v1.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: ushll2 v0.4s, v31.8h, #0 -; CHECK-GI-NEXT: mla v5.4s, v16.4s, v8.4s -; CHECK-GI-NEXT: mla v20.4s, v3.4s, v26.4s -; CHECK-GI-NEXT: sshll2 v3.4s, v25.8h, #0 -; CHECK-GI-NEXT: mla v6.4s, v30.4s, v10.4s -; CHECK-GI-NEXT: mla v7.4s, v15.4s, v11.4s -; CHECK-GI-NEXT: sshll v25.8h, v23.8b, #0 -; CHECK-GI-NEXT: mla v4.4s, v18.4s, v9.4s -; CHECK-GI-NEXT: ushll v30.8h, v22.8b, #0 -; CHECK-GI-NEXT: ushll2 v26.8h, v22.16b, #0 -; CHECK-GI-NEXT: sshll v22.8h, v21.8b, #0 -; CHECK-GI-NEXT: str q3, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ushll2 v8.8h, v27.16b, #0 -; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr q9, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: ushll2 v1.4s, v14.8h, #0 -; CHECK-GI-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill -; CHECK-GI-NEXT: mla v19.4s, v3.4s, v29.4s -; CHECK-GI-NEXT: sshll2 v7.4s, v25.8h, #0 -; CHECK-GI-NEXT: str q5, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: ushll v29.8h, v24.8b, #0 -; CHECK-GI-NEXT: ushll2 v27.8h, v24.16b, #0 -; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill -; CHECK-GI-NEXT: ldp q0, q16, [sp, #96] // 32-byte Folded Reload -; CHECK-GI-NEXT: str q4, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: sshll2 v24.8h, v23.16b, #0 -; CHECK-GI-NEXT: ushll2 v18.4s, v26.8h, #0 -; CHECK-GI-NEXT: stp q19, q20, [sp, #192] // 32-byte Folded Spill -; CHECK-GI-NEXT: sshll2 v20.8h, v21.16b, #0 -; CHECK-GI-NEXT: sshll v21.8h, v17.8b, #0 +; CHECK-GI-NEXT: ushll v31.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll2 v8.8h, v0.16b, #0 +; CHECK-GI-NEXT: ldr x29, [sp, #224] // 8-byte 
Folded Reload +; CHECK-GI-NEXT: sshll v11.8h, v4.8b, #0 +; CHECK-GI-NEXT: sshll2 v12.8h, v4.16b, #0 +; CHECK-GI-NEXT: ushll v9.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v10.8h, v1.16b, #0 +; CHECK-GI-NEXT: ldp q25, q22, [sp, #240] +; CHECK-GI-NEXT: sshll v13.8h, v5.8b, #0 +; CHECK-GI-NEXT: sshll2 v14.8h, v5.16b, #0 +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 +; CHECK-GI-NEXT: smull v19.4s, v31.4h, v11.4h +; CHECK-GI-NEXT: ldp q21, q18, [sp, #272] +; CHECK-GI-NEXT: smull v20.4s, v8.4h, v12.4h +; CHECK-GI-NEXT: sshll v4.8h, v6.8b, #0 +; CHECK-GI-NEXT: sshll2 v5.8h, v6.16b, #0 +; CHECK-GI-NEXT: smull v28.4s, v9.4h, v13.4h +; CHECK-GI-NEXT: ldp q17, q16, [sp, #304] +; CHECK-GI-NEXT: smull v27.4s, v10.4h, v14.4h +; CHECK-GI-NEXT: sshll v6.8h, v7.8b, #0 +; CHECK-GI-NEXT: ushll2 v1.8h, v2.16b, #0 +; CHECK-GI-NEXT: smlal2 v19.4s, v31.8h, v11.8h +; CHECK-GI-NEXT: ldp q30, q29, [sp, #336] +; CHECK-GI-NEXT: smlal2 v20.4s, v8.8h, v12.8h +; CHECK-GI-NEXT: ushll v2.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-GI-NEXT: smlal2 v28.4s, v9.8h, v13.8h +; CHECK-GI-NEXT: stp q0, q6, [sp, #48] // 32-byte Folded Spill +; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0 +; CHECK-GI-NEXT: smlal2 v27.4s, v10.8h, v14.8h +; CHECK-GI-NEXT: smull v26.4s, v0.4h, v4.4h +; CHECK-GI-NEXT: ushll v31.8h, v25.8b, #0 +; CHECK-GI-NEXT: str q19, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: ushll2 v25.8h, v25.16b, #0 +; CHECK-GI-NEXT: ushll v8.8h, v22.8b, #0 +; CHECK-GI-NEXT: stp q2, q1, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: ushll2 v22.8h, v22.16b, #0 +; CHECK-GI-NEXT: ushll v9.8h, v21.8b, #0 +; CHECK-GI-NEXT: stp q5, q28, [sp, #80] // 32-byte Folded Spill +; CHECK-GI-NEXT: ushll2 v21.8h, v21.16b, #0 +; CHECK-GI-NEXT: ushll v10.8h, v18.8b, #0 +; CHECK-GI-NEXT: stp q4, q20, [sp, #112] // 32-byte Folded Spill +; CHECK-GI-NEXT: ushll2 v20.8h, v18.16b, #0 +; CHECK-GI-NEXT: sshll v11.8h, v17.8b, #0 +; CHECK-GI-NEXT: str q27, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q28, 
[sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q27, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: sshll2 v19.8h, v17.16b, #0 -; CHECK-GI-NEXT: sshll2 v17.8h, v0.16b, #0 -; CHECK-GI-NEXT: mul v16.4s, v16.4s, v9.4s -; CHECK-GI-NEXT: ldr q9, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: sshll v23.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v22.8h, #0 -; CHECK-GI-NEXT: ushll2 v12.4s, v27.8h, #0 -; CHECK-GI-NEXT: ushll v26.4s, v26.4h, #0 -; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v17.8h, #0 -; CHECK-GI-NEXT: mul v7.4s, v9.4s, v7.4s -; CHECK-GI-NEXT: ldr q9, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: sshll2 v5.4s, v19.8h, #0 -; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v20.8h, #0 -; CHECK-GI-NEXT: mul v2.4s, v9.4s, v2.4s -; CHECK-GI-NEXT: ldr q9, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ushll2 v15.4s, v8.8h, #0 -; CHECK-GI-NEXT: mul v0.4s, v18.4s, v0.4s -; CHECK-GI-NEXT: ldr q18, [sp, #160] // 16-byte Folded Reload -; CHECK-GI-NEXT: ushll2 v11.4s, v29.8h, #0 -; CHECK-GI-NEXT: sshll v9.4s, v9.4h, #0 -; CHECK-GI-NEXT: ushll2 v13.4s, v30.8h, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v24.8h, #0 -; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v21.8h, #0 -; CHECK-GI-NEXT: sshll2 v6.4s, v23.8h, #0 -; CHECK-GI-NEXT: mul v5.4s, v12.4s, v5.4s -; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0 -; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0 -; CHECK-GI-NEXT: mla v0.4s, v26.4s, v17.4s -; CHECK-GI-NEXT: mul v3.4s, v10.4s, v3.4s -; CHECK-GI-NEXT: mul v1.4s, v15.4s, v1.4s -; CHECK-GI-NEXT: mla v16.4s, v18.4s, v9.4s -; CHECK-GI-NEXT: ldp q18, q17, [sp, #192] // 32-byte Folded Reload -; CHECK-GI-NEXT: mul v4.4s, v11.4s, v4.4s -; CHECK-GI-NEXT: mul v6.4s, v13.4s, v6.4s -; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0 -; CHECK-GI-NEXT: ldp d13, d12, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: sshll v20.4s, v20.4h, #0 -; CHECK-GI-NEXT: ushll v10.4s, v14.4h, #0 -; 
CHECK-GI-NEXT: ldp d15, d14, [sp, #224] // 16-byte Folded Reload -; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0 -; CHECK-GI-NEXT: ushll v31.4s, v31.4h, #0 -; CHECK-GI-NEXT: ushll v29.4s, v29.4h, #0 -; CHECK-GI-NEXT: ushll v30.4s, v30.4h, #0 -; CHECK-GI-NEXT: sshll v25.4s, v25.4h, #0 -; CHECK-GI-NEXT: sshll v24.4s, v24.4h, #0 -; CHECK-GI-NEXT: sshll v22.4s, v22.4h, #0 -; CHECK-GI-NEXT: sshll v21.4s, v21.4h, #0 -; CHECK-GI-NEXT: sshll v23.4s, v23.4h, #0 -; CHECK-GI-NEXT: mla v5.4s, v27.4s, v19.4s -; CHECK-GI-NEXT: ldr q19, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: add v17.4s, v17.4s, v18.4s -; CHECK-GI-NEXT: ldr q18, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: mla v3.4s, v28.4s, v20.4s -; CHECK-GI-NEXT: mla v7.4s, v10.4s, v25.4s -; CHECK-GI-NEXT: ldp d11, d10, [sp, #256] // 16-byte Folded Reload -; CHECK-GI-NEXT: mla v1.4s, v8.4s, v24.4s -; CHECK-GI-NEXT: ldp d9, d8, [sp, #272] // 16-byte Folded Reload -; CHECK-GI-NEXT: add v18.4s, v18.4s, v19.4s -; CHECK-GI-NEXT: ldp q20, q19, [sp, #64] // 32-byte Folded Reload -; CHECK-GI-NEXT: mla v2.4s, v31.4s, v22.4s -; CHECK-GI-NEXT: mla v4.4s, v29.4s, v21.4s -; CHECK-GI-NEXT: mla v6.4s, v30.4s, v23.4s -; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s -; CHECK-GI-NEXT: add v19.4s, v19.4s, v20.4s -; CHECK-GI-NEXT: ldr q20, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-GI-NEXT: add v16.4s, v20.4s, v16.4s -; CHECK-GI-NEXT: add v3.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-GI-NEXT: add v4.4s, v17.4s, v18.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: add v5.4s, v19.4s, v16.4s -; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: sshll v12.8h, v16.8b, #0 +; CHECK-GI-NEXT: sshll2 v18.8h, v16.16b, #0 +; CHECK-GI-NEXT: sshll v13.8h, v30.8b, #0 +; CHECK-GI-NEXT: sshll2 v30.8h, v30.16b, #0 +; CHECK-GI-NEXT: sshll v14.8h, v29.8b, #0 +; CHECK-GI-NEXT: sshll2 v29.8h, v29.16b, #0 +; CHECK-GI-NEXT: smull 
v23.4s, v1.4h, v5.4h +; CHECK-GI-NEXT: smull v15.4s, v3.4h, v7.4h +; CHECK-GI-NEXT: smull v24.4s, v2.4h, v6.4h +; CHECK-GI-NEXT: smull v17.4s, v31.4h, v11.4h +; CHECK-GI-NEXT: smull v6.4s, v25.4h, v19.4h +; CHECK-GI-NEXT: smull v16.4s, v8.4h, v12.4h +; CHECK-GI-NEXT: smull v4.4s, v22.4h, v18.4h +; CHECK-GI-NEXT: smull v5.4s, v9.4h, v13.4h +; CHECK-GI-NEXT: smull v2.4s, v21.4h, v30.4h +; CHECK-GI-NEXT: smull v1.4s, v10.4h, v14.4h +; CHECK-GI-NEXT: smull v0.4s, v20.4h, v29.4h +; CHECK-GI-NEXT: smlal2 v26.4s, v27.8h, v28.8h +; CHECK-GI-NEXT: ldr q28, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q27, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: smlal2 v15.4s, v3.8h, v7.8h +; CHECK-GI-NEXT: ldp q7, q3, [sp, #128] // 32-byte Folded Reload +; CHECK-GI-NEXT: smlal2 v23.4s, v27.8h, v28.8h +; CHECK-GI-NEXT: ldr q28, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q27, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: smlal2 v17.4s, v31.8h, v11.8h +; CHECK-GI-NEXT: smlal2 v6.4s, v25.8h, v19.8h +; CHECK-GI-NEXT: smlal2 v16.4s, v8.8h, v12.8h +; CHECK-GI-NEXT: smlal2 v24.4s, v27.8h, v28.8h +; CHECK-GI-NEXT: smlal2 v4.4s, v22.8h, v18.8h +; CHECK-GI-NEXT: ldr q18, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: smlal2 v5.4s, v9.8h, v13.8h +; CHECK-GI-NEXT: ldp d9, d8, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d13, d12, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: smlal2 v2.4s, v21.8h, v30.8h +; CHECK-GI-NEXT: smlal2 v1.4s, v10.8h, v14.8h +; CHECK-GI-NEXT: ldp d11, d10, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: smlal2 v0.4s, v20.8h, v29.8h +; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s +; CHECK-GI-NEXT: ldr q7, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: add v19.4s, v24.4s, v15.4s +; CHECK-GI-NEXT: ldp d15, d14, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: add v7.4s, v7.4s, v18.4s +; CHECK-GI-NEXT: add v18.4s, v26.4s, v23.4s +; CHECK-GI-NEXT: add v6.4s, v17.4s, v6.4s +; CHECK-GI-NEXT: add v4.4s, 
v16.4s, v4.4s +; CHECK-GI-NEXT: add v2.4s, v5.4s, v2.4s ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: addv s1, v2.4s +; CHECK-GI-NEXT: add v1.4s, v3.4s, v7.4s +; CHECK-GI-NEXT: add v3.4s, v18.4s, v19.4s +; CHECK-GI-NEXT: add v4.4s, v6.4s, v4.4s +; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: add w0, w8, w9 -; CHECK-GI-NEXT: add sp, sp, #304 +; CHECK-GI-NEXT: add sp, sp, #240 ; CHECK-GI-NEXT: ret entry: %az = zext <64 x i8> %a to <64 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll index f83ac8ed642cc..c82f8e19f329a 100644 --- a/llvm/test/CodeGen/AArch64/neon-extmul.ll +++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll @@ -57,14 +57,10 @@ define <8 x i32> @extmulsu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { ; ; CHECK-GI-LABEL: extmulsu_v8i8_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-GI-NEXT: smull v0.4s, v2.4h, v1.4h +; CHECK-GI-NEXT: smull2 v1.4s, v2.8h, v1.8h ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i32> @@ -138,12 +134,8 @@ define <8 x i32> @extmuladdsu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1, <8 x i32> %b) ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: mla 
v2.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: mla v3.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: smlal v2.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: smlal2 v3.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: ret @@ -242,48 +234,12 @@ define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll v4.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll v5.2d, v3.2s, #0 -; CHECK-GI-NEXT: sshll2 v2.2d, v2.4s, #0 -; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 -; CHECK-GI-NEXT: sshll v6.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll v7.2d, v1.2s, #0 -; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-GI-NEXT: fmov x8, d4 -; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: mov x12, v4.d[1] -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: fmov x11, d7 -; CHECK-GI-NEXT: mov x13, v5.d[1] -; CHECK-GI-NEXT: fmov x14, d1 -; CHECK-GI-NEXT: mov x15, v2.d[1] -; CHECK-GI-NEXT: mov x16, v3.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mov x17, v7.d[1] -; CHECK-GI-NEXT: mov x18, v1.d[1] -; CHECK-GI-NEXT: mul x12, x12, x13 -; CHECK-GI-NEXT: mov x13, v0.d[1] -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov x10, d6 -; CHECK-GI-NEXT: mul x15, x15, x16 -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v1.d[0], x9 -; CHECK-GI-NEXT: mul x13, x13, x18 -; CHECK-GI-NEXT: mul x11, x11, x14 -; CHECK-GI-NEXT: mov x14, v6.d[1] -; CHECK-GI-NEXT: mov v0.d[1], x12 -; CHECK-GI-NEXT: mov v2.d[0], x10 -; CHECK-GI-NEXT: mov v1.d[1], x15 -; CHECK-GI-NEXT: mul x14, x14, x17 -; CHECK-GI-NEXT: mov v3.d[0], x11 -; CHECK-GI-NEXT: mov v2.d[1], x14 -; CHECK-GI-NEXT: mov v3.d[1], x13 +; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 +; 
CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: smull v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: smull2 v1.2d, v2.4s, v3.4s +; CHECK-GI-NEXT: smull v2.2d, v4.2s, v5.2s +; CHECK-GI-NEXT: smull2 v3.2d, v4.4s, v5.4s ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> @@ -395,50 +351,14 @@ define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) ; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0 ; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: sshll v16.2d, v6.2s, #0 -; CHECK-GI-NEXT: ushll v17.2d, v7.2s, #0 -; CHECK-GI-NEXT: sshll2 v6.2d, v6.4s, #0 -; CHECK-GI-NEXT: ushll2 v7.2d, v7.4s, #0 -; CHECK-GI-NEXT: sshll v18.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll v19.2d, v1.2s, #0 -; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-GI-NEXT: fmov x8, d16 -; CHECK-GI-NEXT: fmov x9, d17 -; CHECK-GI-NEXT: mov x12, v16.d[1] -; CHECK-GI-NEXT: fmov x10, d7 -; CHECK-GI-NEXT: fmov x11, d19 -; CHECK-GI-NEXT: mov x13, v17.d[1] -; CHECK-GI-NEXT: fmov x14, d1 -; CHECK-GI-NEXT: mov x15, v6.d[1] -; CHECK-GI-NEXT: mov x16, v7.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d6 -; CHECK-GI-NEXT: mov x17, v19.d[1] -; CHECK-GI-NEXT: mov x18, v1.d[1] -; CHECK-GI-NEXT: mul x12, x12, x13 -; CHECK-GI-NEXT: mov x13, v0.d[1] -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov x10, d18 -; CHECK-GI-NEXT: mul x15, x15, x16 -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v1.d[0], x9 -; CHECK-GI-NEXT: mul x13, x13, x18 -; CHECK-GI-NEXT: mul x11, x11, x14 -; CHECK-GI-NEXT: mov x14, v18.d[1] -; CHECK-GI-NEXT: mov v0.d[1], x12 -; CHECK-GI-NEXT: mov v6.d[0], x10 -; CHECK-GI-NEXT: mov v1.d[1], x15 -; CHECK-GI-NEXT: mul x14, x14, x17 -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-GI-NEXT: mov v7.d[0], x11 -; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d -; CHECK-GI-NEXT: mov v6.d[1], x14 -; 
CHECK-GI-NEXT: mov v7.d[1], x13 -; CHECK-GI-NEXT: add v2.2d, v6.2d, v4.2d -; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d +; CHECK-GI-NEXT: smlal v2.2d, v6.2s, v7.2s +; CHECK-GI-NEXT: smlal2 v3.2d, v6.4s, v7.4s +; CHECK-GI-NEXT: smlal v4.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: smlal2 v5.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: mov v2.16b, v4.16b +; CHECK-GI-NEXT: mov v3.16b, v5.16b ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64>