diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 57a4c6f7a4869..0602f50ed1603 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41903,7 +41903,8 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
-                                      const X86Subtarget &Subtarget);
+                                      const X86Subtarget &Subtarget,
+                                      unsigned Depth = 0);
 
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
@@ -57791,7 +57792,8 @@ CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
-                                      const X86Subtarget &Subtarget) {
+                                      const X86Subtarget &Subtarget,
+                                      unsigned Depth) {
   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
 
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
@@ -57803,6 +57805,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       }))
     return getZeroVector(VT, Subtarget, DAG, DL);
 
+  if (Depth >= SelectionDAG::MaxRecursionDepth)
+    return SDValue(); // Limit search depth.
+
   SDValue Op0 = Ops[0];
   bool IsSplat = llvm::all_equal(Ops);
   unsigned NumOps = Ops.size();
@@ -57933,6 +57938,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     }
     return AllConstants || AllSubs;
   };
+  auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
+    bool AllConstants = true;
+    SmallVector<SDValue> Subs;
+    for (SDValue SubOp : SubOps) {
+      SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
+      AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
+                      ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
+      Subs.push_back(SubOp.getOperand(I));
+    }
+    if (AllConstants)
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
+    return combineConcatVectorOps(DL, VT, Subs, DAG, DCI, Subtarget,
+                                  Depth + 1);
+  };
 
   switch (Op0.getOpcode()) {
   case ISD::VECTOR_SHUFFLE: {
@@ -58354,14 +58373,17 @@
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
-    if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
-        (VT.is256BitVector() ||
-         (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
-      return DAG.getNode(Op0.getOpcode(), DL, VT,
-                         ConcatSubOperand(VT, Ops, 0),
-                         ConcatSubOperand(VT, Ops, 1));
+    if (!IsSplat && (VT.is256BitVector() ||
+                     (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+      SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+      SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+      if (Concat0 || Concat1)
+        return DAG.getNode(Op0.getOpcode(), DL, VT,
+                           Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+                           Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
     }
     break;
+  // Always prefer to concatenate high latency FDIV instructions.
   case ISD::FDIV:
     if (!IsSplat && (VT.is256BitVector() ||
                      (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 7a5819c2978ae..1ee03c5f1223f 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1OR2,AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX1OR2,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ;
 ; Basic matrix multiply tests based on the pattern:
 ;
@@ -117,22 +117,38 @@ define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwin
 ; SSE-NEXT:    movapd %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_mul2x2_f64:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
-; AVX-NEXT:    vmulpd %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,1]
-; AVX-NEXT:    vmulpd %xmm4, %xmm2, %xmm4
-; AVX-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; AVX-NEXT:    vmulpd %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,1]
-; AVX-NEXT:    vmulpd %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_mul2x2_f64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vmulpd %ymm2, %ymm3, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_mul2x2_f64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vmulpd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_mul2x2_f64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX512-NEXT:    vmulpd %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    retq
 entry:
   %split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 0, i32 1>
   %split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 2, i32 3>
@@ -958,227 +974,58 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
 ; SSE-NEXT:    movaps %xmm5, %xmm2
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_mul4x4_f32:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm7, %xmm5, %xmm7
-; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm7, %xmm1, %xmm7
-; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm7, %xmm4, %xmm7
-; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vaddps %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm3[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm4, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm5
-; AVX1-NEXT:    vaddps %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_mul4x4_f32:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vbroadcastss %xmm2, %xmm6
-; AVX2-NEXT:    vmulps %xmm6, %xmm0, %xmm6
-; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm7, %xmm5, %xmm7
-; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm7, %xmm1, %xmm7
-; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm7, %xmm4, %xmm7
-; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vbroadcastss %xmm2, %xmm7
-; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm2, %xmm4, %xmm2
-; AVX2-NEXT:    vaddps %xmm2, %xmm7, %xmm2
-; AVX2-NEXT:    vbroadcastss %xmm3, %xmm7
-; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm4, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX2-NEXT:    vbroadcastss %xmm3, %xmm8
-; AVX2-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm5
-; AVX2-NEXT:    vaddps %xmm5, %xmm0, %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm1, %xmm4, %xmm1
-; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test_mul4x4_f32:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm5
-; AVX512F-NEXT:    vmulps %xmm5, %xmm0, %xmm5
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm4, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm3, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX512F-NEXT:    vbroadcastss %xmm6, %xmm7
-; AVX512F-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm8
-; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm3, %xmm8, %xmm8
-; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm7, %xmm6
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
-; AVX512F-NEXT:    vbroadcastss %xmm7, %xmm8
-; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm8
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm9, %xmm9
-; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm3, %xmm9, %xmm9
-; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm7, %xmm2, %xmm7
-; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
-; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm8
-; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm4
-; AVX512F-NEXT:    vaddps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vaddps %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX512F-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm1
-; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    retq
+; AVX1OR2-LABEL: test_mul4x4_f32:
+; AVX1OR2:       # %bb.0: # %entry
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX1OR2-NEXT:    vmulps %ymm4, %ymm5, %ymm4
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm6
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT:    vmulps %ymm0, %ymm6, %ymm0
+; AVX1OR2-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT:    vmulps %ymm7, %ymm4, %ymm7
+; AVX1OR2-NEXT:    vaddps %ymm7, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX1OR2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; AVX1OR2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT:    vmulps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT:    vmulps %ymm5, %ymm6, %ymm5
+; AVX1OR2-NEXT:    vaddps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT:    vmulps %ymm5, %ymm4, %ymm4
+; AVX1OR2-NEXT:    vaddps %ymm4, %ymm2, %ymm2
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; AVX1OR2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1OR2-NEXT:    retq
 ;
-; AVX512VL-LABEL: test_mul4x4_f32:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; AVX512VL-NEXT:    vextractf32x4 $3, %zmm0, %xmm4
-; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm5
-; AVX512VL-NEXT:    vmulps %xmm5, %xmm0, %xmm5
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm2, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm3, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX512VL-NEXT:    vbroadcastss %xmm6, %xmm7
-; AVX512VL-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm8
-; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm3, %xmm8, %xmm8
-; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm7, %xmm6
-; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
-; AVX512VL-NEXT:    vbroadcastss %xmm7, %xmm8
-; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm8
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm9, %xmm9
-; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm3, %xmm9, %xmm9
-; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512VL-NEXT:    vmulps %xmm7, %xmm4, %xmm7
-; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
-; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm8
-; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm2
-; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX512VL-NEXT:    vmulps %xmm1, %xmm4, %xmm1
-; AVX512VL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm1
-; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT:    retq
+; AVX512-LABEL: test_mul4x4_f32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
+; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm3
+; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vshufps {{.*#+}} zmm2 = zmm1[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
+; AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm3 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512-NEXT:    vmulps %zmm2, %zmm3, %zmm2
+; AVX512-NEXT:    vshufps {{.*#+}} zmm3 = zmm1[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm4 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT:    vmulps %zmm3, %zmm4, %zmm3
+; AVX512-NEXT:    vaddps %zmm2, %zmm3, %zmm2
+; AVX512-NEXT:    vshufps {{.*#+}} zmm3 = zmm1[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm4 = zmm0[4,5,4,5,4,5,4,5]
+; AVX512-NEXT:    vmulps %zmm3, %zmm4, %zmm3
+; AVX512-NEXT:    vaddps %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vshufps {{.*#+}} zmm1 = zmm1[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
+; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
 entry:
   %split = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %split1 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1472,113 +1319,42 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
 ; AVX2-NEXT:    vmovapd %ymm6, %ymm2
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: test_mul4x4_f64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT:    vextractf64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT:    vbroadcastsd %xmm2, %ymm6
-; AVX512F-NEXT:    vmulpd %ymm6, %ymm0, %ymm6
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
-; AVX512F-NEXT:    vmulpd %ymm7, %ymm5, %ymm7
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
-; AVX512F-NEXT:    vmulpd %ymm7, %ymm1, %ymm7
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
-; AVX512F-NEXT:    vmulpd %ymm7, %ymm4, %ymm7
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
-; AVX512F-NEXT:    vbroadcastsd %xmm2, %ymm7
-; AVX512F-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
-; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
-; AVX512F-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
-; AVX512F-NEXT:    vmulpd %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vaddpd %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT:    vbroadcastsd %xmm3, %ymm7
-; AVX512F-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
-; AVX512F-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
-; AVX512F-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
-; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vextractf64x4 $1, %zmm3, %ymm3
-; AVX512F-NEXT:    vbroadcastsd %xmm3, %ymm8
-; AVX512F-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm5
-; AVX512F-NEXT:    vaddpd %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm5 = ymm3[2,2,2,2]
-; AVX512F-NEXT:    vmulpd %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
-; AVX512F-NEXT:    vmulpd %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
-; AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: test_mul4x4_f64:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT:    vextractf64x4 $1, %zmm1, %ymm5
-; AVX512VL-NEXT:    vbroadcastsd %xmm2, %ymm6
-; AVX512VL-NEXT:    vmulpd %ymm6, %ymm0, %ymm6
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
-; AVX512VL-NEXT:    vmulpd %ymm7, %ymm4, %ymm7
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
-; AVX512VL-NEXT:    vmulpd %ymm7, %ymm1, %ymm7
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
-; AVX512VL-NEXT:    vmulpd %ymm7, %ymm5, %ymm7
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
-; AVX512VL-NEXT:    vbroadcastsd %xmm2, %ymm7
-; AVX512VL-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
-; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
-; AVX512VL-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
-; AVX512VL-NEXT:    vmulpd %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT:    vaddpd %ymm2, %ymm7, %ymm2
-; AVX512VL-NEXT:    vbroadcastsd %xmm3, %ymm7
-; AVX512VL-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
-; AVX512VL-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
-; AVX512VL-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
-; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vextractf64x4 $1, %zmm3, %ymm3
-; AVX512VL-NEXT:    vbroadcastsd %xmm3, %ymm8
-; AVX512VL-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm4 = ymm3[2,2,2,2]
-; AVX512VL-NEXT:    vmulpd %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
-; AVX512VL-NEXT:    vmulpd %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
-; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
-; AVX512VL-NEXT:    retq
+; AVX512-LABEL: test_mul4x4_f64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd %xmm2, %ymm4
+; AVX512-NEXT:    vmulpd %ymm4, %ymm0, %ymm4
+; AVX512-NEXT:    vextractf64x4 $1, %zmm2, %ymm5
+; AVX512-NEXT:    vbroadcastsd %xmm5, %ymm5
+; AVX512-NEXT:    vmulpd %ymm5, %ymm0, %ymm5
+; AVX512-NEXT:    vbroadcastsd %xmm3, %ymm6
+; AVX512-NEXT:    vmulpd %ymm6, %ymm0, %ymm6
+; AVX512-NEXT:    vextractf64x4 $1, %zmm3, %ymm7
+; AVX512-NEXT:    vbroadcastsd %xmm7, %ymm7
+; AVX512-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
+; AVX512-NEXT:    vinsertf64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm5 = zmm2[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm8 = zmm0[4,5,6,7,4,5,6,7]
+; AVX512-NEXT:    vmulpd %zmm5, %zmm8, %zmm0
+; AVX512-NEXT:    vaddpd %zmm0, %zmm4, %zmm0
+; AVX512-NEXT:    vinsertf64x4 $1, %ymm1, %zmm1, %zmm4
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm5 = zmm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmulpd %zmm5, %zmm4, %zmm5
+; AVX512-NEXT:    vaddpd %zmm5, %zmm0, %zmm0
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm2
+; AVX512-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vinsertf64x4 $1, %ymm7, %zmm6, %zmm2
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm5 = zmm3[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vmulpd %zmm5, %zmm8, %zmm5
+; AVX512-NEXT:    vaddpd %zmm5, %zmm2, %zmm2
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm5 = zmm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmulpd %zmm5, %zmm4, %zmm4
+; AVX512-NEXT:    vaddpd %zmm4, %zmm2, %zmm2
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm3 = zmm3[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vmulpd %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
+; AVX512-NEXT:    retq
 entry:
   %split = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %split1 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
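A note on the mechanism (reviewer commentary, not part of the patch): combineConcatVectorOps matches a concatenation whose subvectors all share one opcode and tries to replace it with a single wide operation on concatenated operands. The reduced IR below is an illustrative sketch only, the function @concat_fadd and its constants are invented here; it shows the general shape the FADD/FSUB/FMUL case matches, a concat of two v4f32 fadds whose second operands fold to one 256-bit constant.

```llvm
; Illustrative sketch only (not a test from this patch): under AVX the two
; 128-bit fadds below can become a single 256-bit fadd, since the constant
; operands concatenate for free.
define <8 x float> @concat_fadd(<4 x float> %a, <4 x float> %b) {
entry:
  %lo = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
  %hi = fadd <4 x float> %b, <float 5.0, float 6.0, float 7.0, float 8.0>
  %r = shufflevector <4 x float> %lo, <4 x float> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %r
}
```

What the patch changes is the profitability test: instead of requiring one operand list to be concatenation-free up front via IsConcatFree, CombineSubOperand recursively attempts to combine each operand list (all-constant lists fold immediately; anything else re-enters combineConcatVectorOps at Depth + 1, bounded by SelectionDAG::MaxRecursionDepth), and the wide node is emitted only if at least one side succeeds, as seen in the improved AVX512 matrix-multiply codegen above.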