diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2fd744391b917..4487b9d510cc7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25557,8 +25557,31 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
   if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))
     return NarrowBOp;
 
-  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
-    return SDValue(N, 0);
+  // If only EXTRACT_SUBVECTOR nodes use the source vector we can
+  // simplify it based on the (valid) extractions.
+  if (!V.getValueType().isScalableVector() &&
+      llvm::all_of(V->users(), [&](SDNode *Use) {
+        return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+               Use->getOperand(0) == V;
+      })) {
+    unsigned NumElts = V.getValueType().getVectorNumElements();
+    APInt DemandedElts = APInt::getZero(NumElts);
+    for (SDNode *User : V->users()) {
+      unsigned ExtIdx = User->getConstantOperandVal(1);
+      unsigned NumSubElts = User->getValueType(0).getVectorNumElements();
+      DemandedElts.setBits(ExtIdx, ExtIdx + NumSubElts);
+    }
+    if (SimplifyDemandedVectorElts(V, DemandedElts, /*AssumeSingleUse=*/true)) {
+      // We simplified the vector operand of this extract subvector. If this
+      // extract is not dead, visit it again so it is folded properly.
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        AddToWorklist(N);
+      return SDValue(N, 0);
+    }
+  } else {
+    if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+      return SDValue(N, 0);
+  }
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 2dfa515d9f05c..8125e062e7ffd 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2573,7 +2573,6 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -2591,7 +2590,6 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -2837,7 +2835,6 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -2855,7 +2852,6 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -3100,7 +3096,6 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -3118,7 +3113,6 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -3614,10 +3608,9 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -3631,10 +3624,9 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -3868,10 +3860,9 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -3885,10 +3876,9 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/pr42905.ll b/llvm/test/CodeGen/X86/pr42905.ll
index 6ebe5be45a4f8..a3ff58e3dcf9b 100644
--- a/llvm/test/CodeGen/X86/pr42905.ll
+++ b/llvm/test/CodeGen/X86/pr42905.ll
@@ -4,16 +4,10 @@ define <4 x double> @autogen_SD30452(i1 %L230) {
 ; CHECK-LABEL: autogen_SD30452:
 ; CHECK: # %bb.0: # %BB
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [151829,151829]
-; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: cvtsi2sd %rax, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; CHECK-NEXT: movq %xmm2, %rax
-; CHECK-NEXT: xorps %xmm2, %xmm2
-; CHECK-NEXT: cvtsi2sd %rax, %xmm2
-; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [151829,151829]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: retq
 BB:
   %I = insertelement <4 x i64> zeroinitializer, i64 151829, i32 3
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index c74440d7ec021..fe71a16039c19 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -927,8 +927,7 @@ define dso_local i32 @sad_nonloop_64i8(ptr nocapture readonly %p, i64, ptr nocap
 ; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
 ; AVX512F-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 40e4bb4b16c79..bc08f57e5faac 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -2079,7 +2079,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT: vpsrld $16, %xmm8, %xmm10
 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
 ; AVX-NEXT: vandnps %ymm10, %ymm6, %ymm6
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
index 5bd9b0292a8f0..638c195850d32 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
@@ -170,7 +170,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512-LABEL: test_v16f32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmaxps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -264,7 +264,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512-LABEL: test_v8f64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vminpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -306,7 +306,7 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index eafee9e65345f..f0f430abc48dc 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -175,7 +175,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512-LABEL: test_v16f32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmaxps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -311,7 +311,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512-LABEL: test_v8f64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -353,7 +353,7 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
index e8f9c7f7b524d..4d6daf3fb77f0 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -216,7 +216,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512-LABEL: test_v16f32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -310,7 +310,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512-LABEL: test_v8f64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vminpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -352,7 +352,7 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vminpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 0103b7622dc3e..ab95081e2938e 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -357,14 +357,14 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BW-LABEL: test_v8i64:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
+; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -390,14 +390,14 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BWVL-LABEL: test_v8i64:
 ; AVX512BWVL: # %bb.0:
 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
+; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -667,14 +667,14 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
+; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -708,14 +708,14 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
+; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index ab216cafcc923..a598e30845579 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3862,15 +3862,14 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX-NEXT: vbroadcastss (%rdi), %ymm3
+; AVX-NEXT: vbroadcastss (%rdi), %xmm3
 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
@@ -4116,7 +4115,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vbroadcastss (%rdi), %ymm1
+; AVX-NEXT: vbroadcastss (%rdi), %xmm1
 ; AVX-NEXT: vmovaps 32(%rsi), %ymm2
 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]