diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 293292d47dd48..0905901c5f69b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6235,6 +6235,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                          DAG.getNode(
                              AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(),
                              Op.getOperand(1), Op.getOperand(2)));
     return SDValue();
+  case Intrinsic::aarch64_neon_sqadd:
+    if (Op.getValueType().isVector())
+      return DAG.getNode(ISD::SADDSAT, dl, Op.getValueType(), Op.getOperand(1),
+                         Op.getOperand(2));
+    return SDValue();
+  case Intrinsic::aarch64_neon_sqsub:
+    if (Op.getValueType().isVector())
+      return DAG.getNode(ISD::SSUBSAT, dl, Op.getValueType(), Op.getOperand(1),
+                         Op.getOperand(2));
+    return SDValue();
+  case Intrinsic::aarch64_neon_uqadd:
+    if (Op.getValueType().isVector())
+      return DAG.getNode(ISD::UADDSAT, dl, Op.getValueType(), Op.getOperand(1),
+                         Op.getOperand(2));
+    return SDValue();
+  case Intrinsic::aarch64_neon_uqsub:
+    if (Op.getValueType().isVector())
+      return DAG.getNode(ISD::USUBSAT, dl, Op.getValueType(), Op.getOperand(1),
+                         Op.getOperand(2));
+    return SDValue();
   case Intrinsic::aarch64_sve_whilelt:
     return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
                                      /*IsEqual=*/false);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 5489541fcb318..6adf84879052f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6256,24 +6256,6 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
                            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
 }
 
-multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> {
-  def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)),
-            (!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>;
-  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
-            (!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>;
-  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
-            (!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>;
-
-  def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)),
-            (!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>;
-  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
-            (!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>;
-  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
-            (!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>;
-  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
-            (!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>;
-}
-
 // As above, but D sized elements unsupported.
 multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
                                   SDPatternOperator OpNode> {
@@ -9861,14 +9843,15 @@ multiclass SIMDIndexedLongSD<bits<4> opc, string asm,
 }
 
 multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
-                                       SDPatternOperator Accum> {
+                                       SDPatternOperator VecAcc,
+                                       SDPatternOperator ScalAcc> {
   def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
                                           V128, V64,
                                           V128_lo, VectorIndexH,
                                           asm, ".4s", ".4s", ".4h", ".h",
     [(set (v4i32 V128:$dst),
-          (Accum (v4i32 V128:$Rd),
-                 (v4i32 (int_aarch64_neon_sqdmull
+          (VecAcc (v4i32 V128:$Rd),
+                  (v4i32 (int_aarch64_neon_sqdmull
                              (v4i16 V64:$Rn),
                              (dup_v8i16 (v8i16 V128_lo:$Rm),
                                         VectorIndexH:$idx)))))]> {
@@ -9883,8 +9866,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
                                           V128_lo, VectorIndexH,
                                           asm#"2", ".4s", ".4s", ".8h", ".h",
     [(set (v4i32 V128:$dst),
-          (Accum (v4i32 V128:$Rd),
-                 (v4i32 (int_aarch64_neon_sqdmull
+          (VecAcc (v4i32 V128:$Rd),
+                  (v4i32 (int_aarch64_neon_sqdmull
                              (extract_high_v8i16 (v8i16 V128:$Rn)),
                              (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))))]> {
   bits<3> idx;
@@ -9898,8 +9881,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
                                           V128, VectorIndexS,
                                           asm, ".2d", ".2d", ".2s", ".s",
     [(set (v2i64 V128:$dst),
-          (Accum (v2i64 V128:$Rd),
-                 (v2i64 (int_aarch64_neon_sqdmull
+          (VecAcc (v2i64 V128:$Rd),
+                  (v2i64 (int_aarch64_neon_sqdmull
                              (v2i32 V64:$Rn),
                              (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> {
   bits<2> idx;
@@ -9912,8 +9895,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
                                           V128, VectorIndexS,
                                           asm#"2", ".2d", ".2d", ".4s", ".s",
     [(set (v2i64 V128:$dst),
-          (Accum (v2i64 V128:$Rd),
-                 (v2i64 (int_aarch64_neon_sqdmull
+          (VecAcc (v2i64 V128:$Rd),
+                  (v2i64 (int_aarch64_neon_sqdmull
                              (extract_high_v4i32 (v4i32 V128:$Rn)),
                              (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> {
   bits<2> idx;
@@ -9930,8 +9913,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
     let Inst{20} = idx{0};
   }
 
-  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
-                        (i32 (vector_extract
+  def : Pat<(i32 (ScalAcc (i32 FPR32Op:$Rd),
+                          (i32 (vector_extract
                                    (v4i32 (int_aarch64_neon_sqdmull
                                               (v4i16 V64:$Rn),
                                               (v4i16 V64:$Rm))),
@@ -9942,8 +9925,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
                 (i64 0))>;
 
-  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
-                        (i32 (vector_extract
+  def : Pat<(i32 (ScalAcc (i32 FPR32Op:$Rd),
+                          (i32 (vector_extract
                                    (v4i32 (int_aarch64_neon_sqdmull
                                               (v4i16 V64:$Rn),
                                               (dup_v8i16 (v8i16 V128_lo:$Rm),
@@ -9959,8 +9942,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
                                       FPR64Op, FPR32Op, V128, VectorIndexS,
                                       asm, ".s", "", "", ".s",
     [(set (i64 FPR64Op:$dst),
-          (Accum (i64 FPR64Op:$Rd),
-                 (i64 (int_aarch64_neon_sqdmulls_scalar
+          (ScalAcc (i64 FPR64Op:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar
                             (i32 FPR32Op:$Rn),
                             (i32 (vector_extract (v4i32 V128:$Rm),
                                                  VectorIndexS:$idx))))))]> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 010c7c391527f..9b256b2a7a878 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5811,12 +5811,12 @@ defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp
 defm SMAX   : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
 defm SMINP  : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
 defm SMIN   : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
-defm SQADD  : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
+defm SQADD  : SIMDThreeSameVector<0,0b00001,"sqadd", saddsat>;
 defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
 defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
 defm SQSHL  : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
-defm SQSUB  : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
+defm SQSUB  : SIMDThreeSameVector<0,0b00101,"sqsub", ssubsat>;
 defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", avgceils>;
 defm SRSHL  : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
 defm SSHL   : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
@@ -5830,10 +5830,10 @@ defm UMAXP  : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp
 defm UMAX   : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
 defm UMINP  : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
 defm UMIN   : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
-defm UQADD  : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
+defm UQADD  : SIMDThreeSameVector<1,0b00001,"uqadd", uaddsat>;
 defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
 defm UQSHL  : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
-defm UQSUB  : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
+defm UQSUB  : SIMDThreeSameVector<1,0b00101,"uqsub", usubsat>;
 defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", avgceilu>;
 defm URSHL  : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
 defm USHL   : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
@@ -5842,12 +5842,6 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
 defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
                                                   int_aarch64_neon_sqrdmlsh>;
 
-// Extra saturate patterns, other than the intrinsics matches above
-defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>;
-defm : SIMDThreeSameVectorExtraPatterns<"UQADD", uaddsat>;
-defm : SIMDThreeSameVectorExtraPatterns<"SQSUB", ssubsat>;
-defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>;
-
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
@@ -6563,10 +6557,8 @@ defm SMLAL   : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
 defm SMLSL   : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
     TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
 defm SMULL   : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>;
-defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
-                                               int_aarch64_neon_sqadd>;
-defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
-                                               int_aarch64_neon_sqsub>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>;
 defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
                                      int_aarch64_neon_sqdmull>;
 defm SSUBL   : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
@@ -8125,9 +8117,9 @@ defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
 defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
     TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
 defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
-defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", saddsat,
                                            int_aarch64_neon_sqadd>;
-defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", ssubsat,
                                            int_aarch64_neon_sqsub>;
 defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
                                           int_aarch64_neon_sqrdmlah>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 7ff2e55e802c5..93f4bc423b63c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1622,8 +1622,10 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
 
 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
-  auto LowerBinOp = [&MI](unsigned Opcode) {
-    MachineIRBuilder MIB(MI);
+  MachineIRBuilder &MIB = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+
+  auto LowerBinOp = [&MI, &MIB](unsigned Opcode) {
     MIB.buildInstr(Opcode, {MI.getOperand(0)},
                    {MI.getOperand(2), MI.getOperand(3)});
     MI.eraseFromParent();
@@ -1642,7 +1644,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     MachineFunction &MF = *MI.getMF();
     auto Val = MF.getRegInfo().createGenericVirtualRegister(
         LLT::scalar(VaListSize * 8));
-    MachineIRBuilder MIB(MI);
     MIB.buildLoad(Val, MI.getOperand(2),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOLoad,
@@ -1655,7 +1656,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return true;
   }
   case Intrinsic::get_dynamic_area_offset: {
-    MachineIRBuilder &MIB = Helper.MIRBuilder;
     MIB.buildConstant(MI.getOperand(0).getReg(), 0);
     MI.eraseFromParent();
     return true;
@@ -1664,14 +1664,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
     // the instruction).
-    MachineIRBuilder MIB(MI);
     auto &Value = MI.getOperand(3);
     Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
     Value.setReg(ExtValueReg);
     return true;
   }
   case Intrinsic::aarch64_prefetch: {
-    MachineIRBuilder MIB(MI);
     auto &AddrVal = MI.getOperand(1);
 
     int64_t IsWrite = MI.getOperand(2).getImm();
@@ -1694,8 +1692,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::aarch64_neon_smaxv:
   case Intrinsic::aarch64_neon_uminv:
   case Intrinsic::aarch64_neon_sminv: {
-    MachineIRBuilder MIB(MI);
-    MachineRegisterInfo &MRI = *MIB.getMRI();
     bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
                     IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
                     IntrinsicID == Intrinsic::aarch64_neon_sminv;
@@ -1720,8 +1716,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   }
   case Intrinsic::aarch64_neon_uaddlp:
   case Intrinsic::aarch64_neon_saddlp: {
-    MachineIRBuilder MIB(MI);
-
     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
                        ? AArch64::G_UADDLP
                        : AArch64::G_SADDLP;
@@ -1732,9 +1726,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   }
   case Intrinsic::aarch64_neon_uaddlv:
   case Intrinsic::aarch64_neon_saddlv: {
-    MachineIRBuilder MIB(MI);
-    MachineRegisterInfo &MRI = *MIB.getMRI();
-
     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
                        ? AArch64::G_UADDLV
                        : AArch64::G_SADDLV;
@@ -1790,11 +1781,30 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerBinOp(AArch64::G_UMULL);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
-    MachineIRBuilder MIB(MI);
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)},
                    {MI.getOperand(2)});
     MI.eraseFromParent();
     return true;
   }
+  case Intrinsic::aarch64_neon_sqadd: {
+    if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+      return LowerBinOp(TargetOpcode::G_SADDSAT);
+    break;
+  }
+  case Intrinsic::aarch64_neon_sqsub: {
+    if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+      return LowerBinOp(TargetOpcode::G_SSUBSAT);
+    break;
+  }
+  case Intrinsic::aarch64_neon_uqadd: {
+    if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+      return LowerBinOp(TargetOpcode::G_UADDSAT);
+    break;
+  }
+  case Intrinsic::aarch64_neon_uqsub: {
+    if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+      return LowerBinOp(TargetOpcode::G_USUBSAT);
+    break;
+  }
   case Intrinsic::vector_reverse:
     // TODO: Add support for vector_reverse
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 9fb8e4c8fe031..bd28d13973f9c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -2539,16 +2539,16 @@ define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coer
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    lsr x8, x0, #16
 ; CHECK-NEXT:    movi v1.2d, #0xffff0000ffff0000
-; CHECK-NEXT:    fmov d5, x0
 ; CHECK-NEXT:    rev32 v4.8h, v0.8h
 ; CHECK-NEXT:    dup v2.8h, w8
 ; CHECK-NEXT:    sqneg v3.8h, v2.8h
 ; CHECK-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    sqdmull v2.4s, v0.4h, v5.h[0]
-; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v5.h[0]
-; CHECK-NEXT:    sqdmlal v2.4s, v4.4h, v1.4h
-; CHECK-NEXT:    sqdmlal2 v0.4s, v4.8h, v1.8h
-; CHECK-NEXT:    uzp2 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    fmov d3, x0
+; CHECK-NEXT:    sqdmull v2.4s, v4.4h, v1.4h
+; CHECK-NEXT:    sqdmull2 v1.4s, v4.8h, v1.8h
+; CHECK-NEXT:    sqdmlal v2.4s, v0.4h, v3.h[0]
+; CHECK-NEXT:    sqdmlal2 v1.4s, v0.8h, v3.h[0]
+; CHECK-NEXT:    uzp2 v0.8h, v2.8h, v1.8h
 ; CHECK-NEXT:    ret
 entry:
   %scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 499786470d4ac..937a17ca6c1e0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -2863,3 +2863,187 @@ define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
   %prod = mul <1 x i64> %lhs, %rhs
   ret <1 x i64> %prod
 }
+
+define <4 x i32> @sqdmlal4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlal4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2)
+  %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlal2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2)
+  %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp)
+  ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlal2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlal2_4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlal2.4s v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+  %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlal2_2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlal2.2d v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+  %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+  ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlal_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlal_lane_4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal.4s v0, v1, v2[3]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0)
+  %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp1)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlal_lane_2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal.2d v0, v1, v2[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0)
+  %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp1)
+  ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlal2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlal2_lane_4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlal2.4s v0, v1, v2[7]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+  %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlal2_lane_2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlal2.2d v0, v1, v2[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+  %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+  ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2)
+  %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2)
+  %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp)
+  ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl2_4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlsl2.4s v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+  %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl2_2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlsl2.2d v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+  %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+  ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl_lane_4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2[3]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0)
+  %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp1)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl_lane_2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0)
+  %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp1)
+  ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl2_lane_4s_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlsl2.4s v0, v1, v2[7]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+  %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+  ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl2_lane_2d_lib:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmlsl2.2d v0, v1, v2[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+  %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+  ret <2 x i64> %sum
+}