diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bc7cdf38dbc2a..ce626ec6590ff 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -545,6 +545,7 @@ namespace { SDValue visitMGATHER(SDNode *N); SDValue visitMSCATTER(SDNode *N); SDValue visitMHISTOGRAM(SDNode *N); + SDValue visitPARTIAL_REDUCE_MLA(SDNode *N); SDValue visitVPGATHER(SDNode *N); SDValue visitVPSCATTER(SDNode *N); SDValue visitVP_STRIDED_LOAD(SDNode *N); @@ -1972,6 +1973,9 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N); + case ISD::PARTIAL_REDUCE_SMLA: + case ISD::PARTIAL_REDUCE_UMLA: + return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); @@ -12497,6 +12501,58 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) { return SDValue(); } +// Makes PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(LHSExtOp), ZEXT(RHSExtOp)), +// Splat(1)) into +// PARTIAL_REDUCE_UMLA(Acc, LHSExtOp, RHSExtOp). +// Makes PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(LHSExtOp), SEXT(RHSExtOp)), +// Splat(1)) into +// PARTIAL_REDUCE_SMLA(Acc, LHSExtOp, RHSExtOp). +SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { + SDLoc DL(N); + + SDValue Acc = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Op2 = N->getOperand(2); + + APInt ConstantOne; + if (Op1->getOpcode() != ISD::MUL || + !ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || + !ConstantOne.isOne()) + return SDValue(); + + SDValue LHS = Op1->getOperand(0); + SDValue RHS = Op1->getOperand(1); + unsigned LHSOpcode = LHS->getOpcode(); + unsigned RHSOpcode = RHS->getOpcode(); + if (!ISD::isExtOpcode(LHSOpcode) || !ISD::isExtOpcode(RHSOpcode)) + return SDValue(); + + SDValue LHSExtOp = LHS->getOperand(0); + SDValue RHSExtOp = RHS->getOperand(0); + EVT LHSExtOpVT = LHSExtOp.getValueType(); + if (LHSExtOpVT != RHSExtOp.getValueType() || LHSOpcode != RHSOpcode) + return SDValue(); + + // FIXME: Add a check to only perform the DAG combine if there is lowering + // provided by the target + + bool ExtIsSigned = LHSOpcode == ISD::SIGN_EXTEND; + + // For a 2-stage extend the signedness of both of the extends must be the + // same. This is so the node can be folded into only a signed or unsigned + // node. + bool NodeIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA; + EVT AccElemVT = Acc.getValueType().getVectorElementType(); + if (ExtIsSigned != NodeIsSigned && + Op1.getValueType().getVectorElementType() != AccElemVT) + return SDValue(); + + unsigned NewOpcode = + ExtIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp, + RHSExtOp); +} + SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { auto *SLD = cast(N); EVT EltVT = SLD->getValueType(0).getVectorElementType(); diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 40daf8ffb63ea..3938a57d0152c 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -12,13 +12,15 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; ; CHECK-NODOT-LABEL: udot: ; CHECK-NODOT: // %bb.0: -; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b -; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b -; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h -; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h -; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h -; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h +; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h +; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NODOT-NEXT: ret %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = zext <16 x i8> %s to <16 x i32> @@ -35,17 +37,19 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { ; ; CHECK-NODOT-LABEL: udot_narrow: ; CHECK-NODOT: // %bb.0: -; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b +; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h +; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8 +; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s ; CHECK-NODOT-NEXT: ret %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> @@ -62,13 +66,15 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; ; CHECK-NODOT-LABEL: sdot: ; CHECK-NODOT: // %bb.0: -; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b -; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b -; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h -; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h -; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h -; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h +; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h +; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NODOT-NEXT: ret %u.wide = sext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 x i32> @@ -85,17 +91,19 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { ; ; CHECK-NODOT-LABEL: sdot_narrow: ; CHECK-NODOT: // %bb.0: -; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b +; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h +; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8 +; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s ; CHECK-NODOT-NEXT: ret %u.wide = sext <8 x i8> %u to <8 x i32> %s.wide = sext <8 x i8> %s to <8 x i32> @@ -223,19 +231,27 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; ; CHECK-NODOT-LABEL: udot_8to64: ; CHECK-NODOT: // %bb.0: // %entry -; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b -; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b -; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0 -; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s -; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d +; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0 +; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s +; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s +; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s +; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s +; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s +; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s +; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s +; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s +; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d ; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d ; CHECK-NODOT-NEXT: ret entry: @@ -258,19 +274,27 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; ; CHECK-NODOT-LABEL: sdot_8to64: ; CHECK-NODOT: // %bb.0: // %entry -; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b -; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b -; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0 -; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s -; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d +; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s +; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s +; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s +; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s +; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s +; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s +; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s +; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s +; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d ; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d ; CHECK-NODOT-NEXT: ret entry: @@ -531,9 +555,10 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-LABEL: not_udot: ; CHECK: // %bb.0: -; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h +; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h ; CHECK-NEXT: ret %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 455231dd37be6..d7bab3297cf29 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -11,24 +11,23 @@ define @udot( %acc, %a, ; ; CHECK-NEWLOWERING-LABEL: udot: ; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b -; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z1.b ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b ; CHECK-NEWLOWERING-NEXT: ptrue p0.s ; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h ; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h ; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h -; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z6.s, z1.h +; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s +; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s ; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEWLOWERING-NEXT: ret entry: @@ -47,24 +46,23 @@ define @udot_wide( %acc, ; ; CHECK-NEWLOWERING-LABEL: udot_wide: ; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h ; CHECK-NEWLOWERING-NEXT: ptrue p0.d ; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z1.s +; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d +; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d +; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d ; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEWLOWERING-NEXT: ret entry: @@ -83,24 +81,23 @@ define @sdot( %accc, %a, ; ; CHECK-NEWLOWERING-LABEL: sdot: ; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b -; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z1.b ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b ; CHECK-NEWLOWERING-NEXT: ptrue p0.s ; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h ; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h ; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h -; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z6.s, z1.h +; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpklo z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s +; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s ; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEWLOWERING-NEXT: ret entry: @@ -119,24 +116,23 @@ define @sdot_wide( %acc, ; ; CHECK-NEWLOWERING-LABEL: sdot_wide: ; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z1.h ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEWLOWERING-NEXT: ptrue p0.d ; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d +; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z1.s +; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d +; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d +; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d ; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEWLOWERING-NEXT: ret entry: @@ -278,59 +274,46 @@ define @udot_8to64( %acc, %a to @@ -354,59 +337,46 @@ define @sdot_8to64( %acc, %a to @@ -875,11 +845,11 @@ define @not_udot( %acc, % ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z2.h, z2.h, #0xff ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z2.h +; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: mla z0.s, p0/m, z4.s, z3.s ; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret ; @@ -888,11 +858,11 @@ define @not_udot( %acc, % ; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff ; CHECK-NEWLOWERING-NEXT: ptrue p0.s -; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z4.s, z3.s ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s ; CHECK-NEWLOWERING-NEXT: ret entry: @@ -909,11 +879,11 @@ define @not_udot_wide( %acc, @not_udot_wide( %acc, @udot_nxv8i8_promote ( %acc, %a to @@ -1321,17 +1305,24 @@ define @sdot_nxv8i8_promote ( %acc, @sdot_nxv8i8_promote ( %acc, %a to