From c3081f4bdbde62c6448998cccb3f0ace3b7c418d Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Thu, 8 May 2025 13:52:28 +0200 Subject: [PATCH 1/3] [SelectionDAG] Fix bug related to demanded bits/elts for BITCAST When we have a BITCAST and the source type is a vector with smaller elements compared to the destination type, then we need to demand all the source elements that make up the demanded elts for the result when doing recursive calls to SimplifyDemandedBits, SimplifyDemandedVectorElts and SimplifyMultipleUseDemandedBits. The problem is that those simplifications are allowed to turn non-demanded elements of a vector into POISON, so unless we demand all source elements that make up the result, there is a risk that the result would be more poisonous (even for demanded elts) after the simplification. The patch fixes some bugs in SimplifyMultipleUseDemandedBits and SimplifyDemandedBits for situations when we did not consider the problem described above. Now we make sure that we also demand vector elements that "must not be turned into poison" even if those elements correspond to bits that do not need to be defined according to the DemandedBits mask. 
Fixes #138513 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 22 +- llvm/test/CodeGen/AArch64/reduce-or.ll | 11 +- llvm/test/CodeGen/AArch64/reduce-xor.ll | 11 +- .../AArch64/vecreduce-and-legalization.ll | 12 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 98 +- llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll | 7 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 602 +- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 90 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 896 ++- .../AMDGPU/load-range-metadata-sign-bits.ll | 5 +- llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 13 +- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 60 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 80 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 16 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 80 +- llvm/test/CodeGen/ARM/fpclamptosat_vec.ll | 444 +- .../CodeGen/Thumb2/mve-fpclamptosat_vec.ll | 80 +- .../Thumb2/mve-gather-ind8-unscaled.ll | 5 - .../CodeGen/Thumb2/mve-laneinterleaving.ll | 86 +- llvm/test/CodeGen/Thumb2/mve-pred-ext.ll | 1 + llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 213 +- .../Thumb2/mve-scatter-ind8-unscaled.ll | 9 +- .../CodeGen/Thumb2/mve-vecreduce-addpred.ll | 8 +- .../CodeGen/Thumb2/mve-vecreduce-mlapred.ll | 8 +- .../X86/avx512-intrinsics-fast-isel.ll | 4 +- .../CodeGen/X86/avx512-intrinsics-upgrade.ll | 46 +- .../X86/avx512vl-intrinsics-upgrade.ll | 52 +- .../CodeGen/X86/avx512vl-vec-masked-cmp.ll | 250 +- .../test/CodeGen/X86/bitcast-and-setcc-128.ll | 28 +- llvm/test/CodeGen/X86/bitcast-setcc-128.ll | 12 +- llvm/test/CodeGen/X86/bitcast-setcc-512.ll | 11 +- llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 15 +- .../CodeGen/X86/buildvec-widen-dotproduct.ll | 71 +- llvm/test/CodeGen/X86/combine-pmuldq.ll | 87 +- llvm/test/CodeGen/X86/combine-sdiv.ll | 51 +- llvm/test/CodeGen/X86/combine-sra.ll | 63 +- llvm/test/CodeGen/X86/combine-udiv.ll | 5 +- .../CodeGen/X86/f16c-intrinsics-fast-isel.ll | 4 + llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 24 +- .../CodeGen/X86/fminimumnum-fmaximumnum.ll | 24 +- 
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 89 +- llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 300 +- ...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 48 +- llvm/test/CodeGen/X86/known-never-zero.ll | 3 +- llvm/test/CodeGen/X86/known-pow2.ll | 12 +- llvm/test/CodeGen/X86/known-signbits-shl.ll | 3 +- .../test/CodeGen/X86/known-signbits-vector.ll | 30 +- llvm/test/CodeGen/X86/masked_store.ll | 18 +- llvm/test/CodeGen/X86/movmsk-cmp.ll | 109 +- llvm/test/CodeGen/X86/mulvi32.ll | 15 +- ...of-two-or-zero-when-comparing-with-zero.ll | 25 +- llvm/test/CodeGen/X86/pmul.ll | 140 +- llvm/test/CodeGen/X86/pmulh.ll | 6 +- llvm/test/CodeGen/X86/pr107423.ll | 26 +- llvm/test/CodeGen/X86/pr35918.ll | 4 +- llvm/test/CodeGen/X86/pr41619.ll | 2 + llvm/test/CodeGen/X86/pr42727.ll | 2 +- llvm/test/CodeGen/X86/pr45563-2.ll | 2 +- llvm/test/CodeGen/X86/pr45833.ll | 2 +- llvm/test/CodeGen/X86/pr77459.ll | 2 +- llvm/test/CodeGen/X86/promote-cmp.ll | 41 +- llvm/test/CodeGen/X86/promote-vec3.ll | 9 +- llvm/test/CodeGen/X86/psubus.ll | 255 +- .../test/CodeGen/X86/rotate-extract-vector.ll | 40 +- llvm/test/CodeGen/X86/sadd_sat_vec.ll | 207 +- llvm/test/CodeGen/X86/sat-add.ll | 21 +- llvm/test/CodeGen/X86/sdiv-exact.ll | 30 +- llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 261 +- llvm/test/CodeGen/X86/shrink_vmul.ll | 2 + .../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 20 +- llvm/test/CodeGen/X86/sshl_sat_vec.ll | 134 +- llvm/test/CodeGen/X86/ssub_sat_vec.ll | 274 +- llvm/test/CodeGen/X86/test-shrink-bug.ll | 2 +- llvm/test/CodeGen/X86/ucmp.ll | 435 +- llvm/test/CodeGen/X86/udiv-exact.ll | 30 +- .../CodeGen/X86/urem-seteq-illegal-types.ll | 8 +- .../CodeGen/X86/urem-seteq-vec-nonsplat.ll | 294 +- llvm/test/CodeGen/X86/ushl_sat_vec.ll | 29 +- .../CodeGen/X86/vec-strict-inttofp-256.ll | 80 +- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 285 +- llvm/test/CodeGen/X86/vec_minmax_sint.ll | 158 +- llvm/test/CodeGen/X86/vec_minmax_uint.ll | 158 +- llvm/test/CodeGen/X86/vec_smulo.ll | 282 +- 
llvm/test/CodeGen/X86/vec_umulo.ll | 188 +- .../test/CodeGen/X86/vector-compare-all_of.ll | 36 +- .../test/CodeGen/X86/vector-compare-any_of.ll | 36 +- .../X86/vector-constrained-fp-intrinsics.ll | 42 +- llvm/test/CodeGen/X86/vector-fshl-128.ll | 298 +- llvm/test/CodeGen/X86/vector-fshl-256.ll | 95 +- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 93 +- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 29 +- llvm/test/CodeGen/X86/vector-fshr-128.ll | 60 +- llvm/test/CodeGen/X86/vector-fshr-256.ll | 39 +- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 61 +- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 4 +- .../vector-interleaved-store-i32-stride-5.ll | 18 +- .../vector-interleaved-store-i32-stride-7.ll | 6567 ++++++++--------- .../vector-interleaved-store-i8-stride-8.ll | 436 +- llvm/test/CodeGen/X86/vector-mul.ll | 68 +- llvm/test/CodeGen/X86/vector-pcmp.ll | 5 +- .../CodeGen/X86/vector-reduce-fmaximum.ll | 163 +- llvm/test/CodeGen/X86/vector-reduce-mul.ll | 168 +- llvm/test/CodeGen/X86/vector-reduce-smax.ll | 168 +- llvm/test/CodeGen/X86/vector-reduce-smin.ll | 161 +- llvm/test/CodeGen/X86/vector-reduce-umax.ll | 168 +- llvm/test/CodeGen/X86/vector-reduce-umin.ll | 161 +- llvm/test/CodeGen/X86/vector-rotate-128.ll | 93 +- llvm/test/CodeGen/X86/vector-rotate-256.ll | 29 +- llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 52 +- llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 42 +- .../CodeGen/X86/vector-shift-shl-sub128.ll | 104 +- .../CodeGen/X86/vector-shuffle-256-v16.ll | 32 +- .../X86/vector-shuffle-combining-avx.ll | 12 +- .../X86/vector-shuffle-combining-ssse3.ll | 9 +- .../CodeGen/X86/vector-shuffle-combining.ll | 9 +- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 1751 +++-- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 1343 ++-- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 766 +- ...vector_splat-const-shift-of-constmasked.ll | 9 +- llvm/test/CodeGen/X86/vselect.ll | 30 +- 120 files changed, 10981 insertions(+), 9861 deletions(-) diff --git 
a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index ba34c72156228..d5697b6031537 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -711,18 +711,17 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( unsigned Scale = NumDstEltBits / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); - APInt DemandedSrcElts = APInt::getZero(NumSrcElts); for (unsigned i = 0; i != Scale; ++i) { unsigned EltOffset = IsLE ? i : (Scale - 1 - i); unsigned BitOffset = EltOffset * NumSrcEltBits; APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset); - if (!Sub.isZero()) { + if (!Sub.isZero()) DemandedSrcBits |= Sub; - for (unsigned j = 0; j != NumElts; ++j) - if (DemandedElts[j]) - DemandedSrcElts.setBit((j * Scale) + i); - } } + // Need to demand all smaller source elements that maps to a demanded + // destination element, since recursive calls below may turn not demanded + // elements into poison. + APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); if (SDValue V = SimplifyMultipleUseDemandedBits( Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1)) @@ -2755,18 +2754,17 @@ bool TargetLowering::SimplifyDemandedBits( unsigned Scale = BitWidth / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); - APInt DemandedSrcElts = APInt::getZero(NumSrcElts); for (unsigned i = 0; i != Scale; ++i) { unsigned EltOffset = IsLE ? 
i : (Scale - 1 - i); unsigned BitOffset = EltOffset * NumSrcEltBits; APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset); - if (!Sub.isZero()) { + if (!Sub.isZero()) DemandedSrcBits |= Sub; - for (unsigned j = 0; j != NumElts; ++j) - if (DemandedElts[j]) - DemandedSrcElts.setBit((j * Scale) + i); - } } + // Need to demand all smaller source elements that maps to a demanded + // destination element, since recursive calls below may turn not demanded + // elements into poison. + APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); APInt KnownSrcUndef, KnownSrcZero; if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef, diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll index aac31ce8b71b7..f5291f5debb40 100644 --- a/llvm/test/CodeGen/AArch64/reduce-or.ll +++ b/llvm/test/CodeGen/AArch64/reduce-or.ll @@ -218,13 +218,12 @@ define i8 @test_redor_v3i8(<3 x i8> %a) { ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov v0.h[0], w0 ; CHECK-NEXT: mov v0.h[1], w1 -; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: mov v0.h[2], w2 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: lsr x10, x9, #32 -; CHECK-NEXT: lsr x9, x9, #16 -; CHECK-NEXT: orr w8, w8, w10 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: lsr x10, x8, #16 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: orr w0, w8, w10 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v3i8: diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll index 9a00172f94763..df8485b91468f 100644 --- a/llvm/test/CodeGen/AArch64/reduce-xor.ll +++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll @@ -207,13 +207,12 @@ define i8 @test_redxor_v3i8(<3 x i8> %a) { ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov v0.h[0], w0 ; CHECK-NEXT: mov v0.h[1], w1 -; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: mov v0.h[2], w2 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: lsr x10, x9, #32 -; 
CHECK-NEXT: lsr x9, x9, #16 -; CHECK-NEXT: eor w8, w8, w10 -; CHECK-NEXT: eor w0, w8, w9 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: lsr x10, x8, #16 +; CHECK-NEXT: eor w8, w8, w9 +; CHECK-NEXT: eor w0, w8, w10 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v3i8: diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll index 7fa416e0dbcd5..e21ae88d52b47 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -125,12 +125,14 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { define i32 @test_v3i32(<3 x i32> %a) nounwind { ; CHECK-LABEL: test_v3i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-NEXT: mov v1.s[3], w8 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: and v1.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) ret i32 %b diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index d1090738e24a6..07fc5f30f23a3 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1904,69 +1904,74 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, 
vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: flat_load_ubyte v8, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[0:1] +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v9, v[0:1] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: flat_load_ubyte v7, v[8:9] +; VI-NEXT: flat_load_ubyte v3, v[10:11] +; VI-NEXT: flat_load_ubyte v6, v[12:13] +; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v8, 0x3020504 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_e32 v4, v9, v4 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v7 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; VI-NEXT: v_perm_b32 v4, v4, s0, v8 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 -; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 +; VI-NEXT: 
v_cvt_f32_ubyte1_e32 v5, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: 
@@ -1984,8 +1989,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 +; GFX9-NEXT: v_cvt_f32_ubyte1_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_ubyte0_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -2001,34 +2006,33 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 +; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:6 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 ; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 -; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1 -; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4 +; GFX11-NEXT: global_load_u8 v6, v0, s[2:3] offset:1 +; GFX11-NEXT: global_load_d16_b16 v4, v0, s[2:3] offset:4 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 ; 
GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b96 v7, v[4:6], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll index 933c6506d0270..c29039b86e82b 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll @@ -150,9 +150,10 @@ define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) { ; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v0 -; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[3:4] -; CHECK-NEXT: flat_store_dword v[2:3], v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0 +; CHECK-NEXT: v_mul_hi_u32 v6, v1, v0 +; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[4:5] +; CHECK-NEXT: flat_store_dword v[2:3], v6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index a9240eff8e691..6b7f648f65a45 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -8340,191 +8340,216 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_mov_b32 s13, s7 +; GFX6-NEXT: s_mov_b32 s17, s7 +; GFX6-NEXT: s_mov_b32 s19, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s42, s5, 30 -; GFX6-NEXT: s_lshr_b32 s36, s5, 28 -; GFX6-NEXT: s_lshr_b32 s38, s5, 29 -; GFX6-NEXT: s_lshr_b32 s30, s5, 26 -; GFX6-NEXT: s_lshr_b32 s34, s5, 27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 24 -; GFX6-NEXT: s_lshr_b32 s28, s5, 25 -; GFX6-NEXT: s_lshr_b32 s22, s5, 22 -; GFX6-NEXT: s_lshr_b32 s24, s5, 23 -; GFX6-NEXT: s_lshr_b32 s18, s5, 20 -; GFX6-NEXT: s_lshr_b32 s20, s5, 21 -; GFX6-NEXT: s_lshr_b32 s14, s5, 18 -; GFX6-NEXT: s_lshr_b32 s16, s5, 19 -; GFX6-NEXT: s_lshr_b32 s10, s5, 16 -; GFX6-NEXT: s_lshr_b32 s12, s5, 17 -; GFX6-NEXT: s_lshr_b32 s6, s5, 14 -; GFX6-NEXT: s_lshr_b32 s8, s5, 15 -; GFX6-NEXT: s_mov_b32 s40, s5 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-NEXT: s_lshr_b32 s40, s5, 12 +; GFX6-NEXT: s_lshr_b32 s6, s5, 30 +; GFX6-NEXT: s_lshr_b32 s8, s5, 28 +; GFX6-NEXT: s_lshr_b32 s10, s5, 29 +; GFX6-NEXT: s_lshr_b32 s12, s5, 26 +; GFX6-NEXT: s_lshr_b32 s16, s5, 27 +; GFX6-NEXT: s_mov_b32 s18, s5 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[18:19], 0x10000 +; GFX6-NEXT: s_ashr_i32 s18, s5, 31 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[6:7], 0x10000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s31, s7 +; GFX6-NEXT: s_mov_b32 s35, s7 +; GFX6-NEXT: s_mov_b32 s25, s7 +; GFX6-NEXT: s_mov_b32 s27, s7 +; GFX6-NEXT: s_mov_b32 s21, s7 +; GFX6-NEXT: s_mov_b32 s23, s7 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 ; 
GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s44 -; GFX6-NEXT: v_mov_b32_e32 v7, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 13 +; GFX6-NEXT: s_mov_b32 s45, s7 +; GFX6-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NEXT: v_mov_b32_e32 v7, s15 +; GFX6-NEXT: s_mov_b32 s47, s7 ; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 10 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 11 +; GFX6-NEXT: s_mov_b32 s43, s7 +; GFX6-NEXT: v_mov_b32_e32 v8, s40 +; GFX6-NEXT: v_mov_b32_e32 v9, s41 +; GFX6-NEXT: s_mov_b32 s41, s7 ; GFX6-NEXT: v_mov_b32_e32 v10, s38 ; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 8 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_mov_b32 s39, s7 +; GFX6-NEXT: v_mov_b32_e32 v12, s36 +; GFX6-NEXT: v_mov_b32_e32 v13, s37 +; GFX6-NEXT: s_mov_b32 s15, s7 +; GFX6-NEXT: v_mov_b32_e32 v14, s28 +; GFX6-NEXT: v_mov_b32_e32 v15, s29 +; GFX6-NEXT: s_mov_b32 s37, s7 +; GFX6-NEXT: s_lshr_b32 s30, s5, 24 +; GFX6-NEXT: s_lshr_b32 s34, s5, 25 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s5, 9 -; GFX6-NEXT: v_mov_b32_e32 v14, s34 -; GFX6-NEXT: v_mov_b32_e32 v15, s35 -; GFX6-NEXT: s_lshr_b32 s34, s5, 6 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[30:31], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v5, s18 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: 
v_mov_b32_e32 v3, s27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 7 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 -; GFX6-NEXT: s_lshr_b32 s28, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: s_mov_b32 s29, s7 +; GFX6-NEXT: v_mov_b32_e32 v4, s34 +; GFX6-NEXT: v_mov_b32_e32 v5, s35 +; GFX6-NEXT: s_lshr_b32 s24, s5, 22 +; GFX6-NEXT: s_lshr_b32 s26, s5, 23 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s22 -; GFX6-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NEXT: s_lshr_b32 s22, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v10, s24 -; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s5, 2 +; GFX6-NEXT: v_mov_b32_e32 v8, s24 +; GFX6-NEXT: v_mov_b32_e32 v9, s25 +; GFX6-NEXT: s_mov_b32 s25, s7 +; GFX6-NEXT: v_mov_b32_e32 v10, s26 +; GFX6-NEXT: v_mov_b32_e32 v11, s27 +; GFX6-NEXT: s_mov_b32 s27, s7 +; GFX6-NEXT: s_lshr_b32 s20, s5, 20 +; GFX6-NEXT: s_lshr_b32 s22, s5, 21 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s18 -; GFX6-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 3 -; GFX6-NEXT: v_mov_b32_e32 v14, s20 -; GFX6-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NEXT: s_lshr_b32 s20, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v12, s20 +; GFX6-NEXT: v_mov_b32_e32 v13, s21 +; GFX6-NEXT: s_mov_b32 s35, s7 +; GFX6-NEXT: v_mov_b32_e32 v14, s22 +; GFX6-NEXT: v_mov_b32_e32 v15, s23 +; GFX6-NEXT: s_mov_b32 s21, s7 +; GFX6-NEXT: s_mov_b32 s23, s7 +; GFX6-NEXT: s_lshr_b32 s16, s5, 18 +; GFX6-NEXT: s_lshr_b32 s18, s5, 19 +; GFX6-NEXT: s_lshr_b32 
s10, s5, 16 +; GFX6-NEXT: s_lshr_b32 s12, s5, 17 +; GFX6-NEXT: s_lshr_b32 s8, s5, 14 +; GFX6-NEXT: s_lshr_b32 s44, s5, 15 +; GFX6-NEXT: s_lshr_b32 s46, s5, 12 +; GFX6-NEXT: s_lshr_b32 s42, s5, 13 +; GFX6-NEXT: s_lshr_b32 s40, s5, 10 +; GFX6-NEXT: s_lshr_b32 s38, s5, 11 +; GFX6-NEXT: s_lshr_b32 s14, s5, 8 +; GFX6-NEXT: s_lshr_b32 s36, s5, 9 +; GFX6-NEXT: s_lshr_b32 s28, s5, 6 +; GFX6-NEXT: s_lshr_b32 s30, s5, 7 +; GFX6-NEXT: s_lshr_b32 s24, s5, 4 +; GFX6-NEXT: s_lshr_b32 s26, s5, 5 +; GFX6-NEXT: s_lshr_b32 s34, s5, 2 +; GFX6-NEXT: s_lshr_b32 s20, s5, 3 +; GFX6-NEXT: s_lshr_b32 s22, s5, 1 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 30 -; GFX6-NEXT: v_mov_b32_e32 v4, s16 -; GFX6-NEXT: v_mov_b32_e32 v5, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 31 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 30 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: s_lshr_b32 s18, s4, 31 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s10 ; GFX6-NEXT: v_mov_b32_e32 v9, s11 ; GFX6-NEXT: s_lshr_b32 s10, s4, 28 -; GFX6-NEXT: v_mov_b32_e32 v10, s12 -; GFX6-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v11, s7 ; GFX6-NEXT: s_lshr_b32 s12, s4, 29 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: 
buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s46, s4, 26 -; GFX6-NEXT: v_mov_b32_e32 v14, s8 -; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s8 +; GFX6-NEXT: v_mov_b32_e32 v13, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 26 +; GFX6-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NEXT: s_lshr_b32 s44, s4, 27 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[46:47], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 24 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NEXT: s_lshr_b32 s44, s4, 25 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000 +; GFX6-NEXT: s_lshr_b32 s46, s4, 25 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 22 +; GFX6-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 22 ; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: s_lshr_b32 s42, s4, 23 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000 +; GFX6-NEXT: s_lshr_b32 s40, s4, 23 +; GFX6-NEXT: s_bfe_i64 s[6:7], 
s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 20 +; GFX6-NEXT: v_mov_b32_e32 v12, s14 +; GFX6-NEXT: v_mov_b32_e32 v13, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 20 ; GFX6-NEXT: v_mov_b32_e32 v14, s6 ; GFX6-NEXT: v_mov_b32_e32 v15, s7 ; GFX6-NEXT: s_lshr_b32 s6, s4, 21 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 -; GFX6-NEXT: v_mov_b32_e32 v16, s34 -; GFX6-NEXT: v_mov_b32_e32 v17, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 18 -; GFX6-NEXT: v_mov_b32_e32 v18, s26 -; GFX6-NEXT: v_mov_b32_e32 v19, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 19 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v16, s28 +; GFX6-NEXT: v_mov_b32_e32 v17, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 18 +; GFX6-NEXT: v_mov_b32_e32 v18, s30 +; GFX6-NEXT: v_mov_b32_e32 v19, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 19 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s28 -; GFX6-NEXT: v_mov_b32_e32 v9, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 17 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s24 +; GFX6-NEXT: v_mov_b32_e32 v9, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 16 +; GFX6-NEXT: v_mov_b32_e32 v10, s26 +; GFX6-NEXT: v_mov_b32_e32 v11, 
s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 17 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 14 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s34 +; GFX6-NEXT: v_mov_b32_e32 v13, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 14 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s18 -; GFX6-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 12 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s20 +; GFX6-NEXT: v_mov_b32_e32 v15, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 12 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s14 -; GFX6-NEXT: v_mov_b32_e32 v17, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v18, s16 -; GFX6-NEXT: v_mov_b32_e32 v19, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v16, s16 +; GFX6-NEXT: v_mov_b32_e32 v17, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v18, s18 +; GFX6-NEXT: v_mov_b32_e32 v19, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 10 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288 @@ -8535,62 +8560,62 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; 
GFX6-NEXT: v_mov_b32_e32 v10, s12 ; GFX6-NEXT: v_mov_b32_e32 v11, s13 ; GFX6-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s38 -; GFX6-NEXT: v_mov_b32_e32 v13, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v14, s8 -; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s8 +; GFX6-NEXT: v_mov_b32_e32 v13, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v14, s36 +; GFX6-NEXT: v_mov_b32_e32 v15, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 6 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 7 ; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: v_mov_b32_e32 v3, s45 ; GFX6-NEXT: s_lshr_b32 s44, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s36 -; GFX6-NEXT: v_mov_b32_e32 v17, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v18, s42 -; GFX6-NEXT: v_mov_b32_e32 v19, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 
0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s38 +; GFX6-NEXT: v_mov_b32_e32 v17, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v18, s40 +; GFX6-NEXT: v_mov_b32_e32 v19, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 2 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s30 -; GFX6-NEXT: v_mov_b32_e32 v9, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v8, s14 +; GFX6-NEXT: v_mov_b32_e32 v9, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 3 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], 
s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 @@ -8599,58 +8624,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v11, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(2) -; GFX6-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s28 ; GFX6-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s24 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s22 +; GFX6-NEXT: v_mov_b32_e32 v1, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: v_mov_b32_e32 v8, s4 ; GFX6-NEXT: v_mov_b32_e32 v9, s5 @@ -8661,8 +8686,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[8:11], 
s[4:5], 0x24 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX8-NEXT: s_mov_b32 s69, 0 +; GFX8-NEXT: s_mov_b32 s67, s69 +; GFX8-NEXT: s_mov_b32 s41, s69 +; GFX8-NEXT: s_mov_b32 s61, s69 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; GFX8-NEXT: s_mov_b32 s43, s69 +; GFX8-NEXT: s_mov_b32 s65, s69 +; GFX8-NEXT: s_mov_b32 s45, s69 +; GFX8-NEXT: s_mov_b32 s57, s69 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s0, s3, 8 ; GFX8-NEXT: s_lshr_b32 s48, s3, 15 @@ -8675,14 +8708,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s36, s3, 27 ; GFX8-NEXT: s_lshr_b32 s68, s3, 24 ; GFX8-NEXT: s_lshr_b32 s38, s3, 25 -; GFX8-NEXT: s_lshr_b32 s64, s3, 22 +; GFX8-NEXT: s_lshr_b32 s66, s3, 22 ; GFX8-NEXT: s_lshr_b32 s40, s3, 23 ; GFX8-NEXT: s_lshr_b32 s60, s3, 20 ; GFX8-NEXT: s_lshr_b32 s42, s3, 21 -; GFX8-NEXT: s_lshr_b32 s66, s3, 18 +; GFX8-NEXT: s_lshr_b32 s64, s3, 18 ; GFX8-NEXT: s_lshr_b32 s44, s3, 19 ; GFX8-NEXT: s_lshr_b32 s56, s3, 16 ; GFX8-NEXT: s_lshr_b32 s46, s3, 17 +; GFX8-NEXT: s_mov_b32 s47, s69 ; GFX8-NEXT: s_lshr_b32 s58, s3, 14 ; GFX8-NEXT: s_lshr_b32 s62, s3, 12 ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 @@ -8691,13 +8725,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s3, 11 ; GFX8-NEXT: v_writelane_b32 v62, s0, 2 +; GFX8-NEXT: s_mov_b32 s23, s69 ; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 
s[62:63], s[62:63], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 @@ -8729,9 +8764,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v12, s72 ; GFX8-NEXT: v_mov_b32_e32 v0, s70 ; GFX8-NEXT: v_mov_b32_e32 v8, s68 -; GFX8-NEXT: v_mov_b32_e32 v16, s64 +; GFX8-NEXT: v_mov_b32_e32 v16, s66 ; GFX8-NEXT: v_mov_b32_e32 v20, s60 -; GFX8-NEXT: v_mov_b32_e32 v24, s66 +; GFX8-NEXT: v_mov_b32_e32 v24, s64 ; GFX8-NEXT: v_mov_b32_e32 v28, s56 ; GFX8-NEXT: v_mov_b32_e32 v32, s58 ; GFX8-NEXT: v_mov_b32_e32 v36, s62 @@ -8794,11 +8829,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NEXT: v_mov_b32_e32 v9, s69 ; GFX8-NEXT: v_mov_b32_e32 v11, s39 -; GFX8-NEXT: v_mov_b32_e32 v17, s65 +; GFX8-NEXT: v_mov_b32_e32 v17, s67 ; GFX8-NEXT: v_mov_b32_e32 v19, s41 ; GFX8-NEXT: v_mov_b32_e32 v21, s61 ; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_mov_b32_e32 v25, s67 +; GFX8-NEXT: v_mov_b32_e32 v25, s65 ; GFX8-NEXT: v_mov_b32_e32 v27, s45 ; GFX8-NEXT: v_mov_b32_e32 v29, s57 ; GFX8-NEXT: v_mov_b32_e32 v31, s47 @@ -9462,48 +9497,59 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s67, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s57, s67 +; GFX12-NEXT: s_mov_b32 s63, s67 +; GFX12-NEXT: s_mov_b32 s45, s67 +; GFX12-NEXT: s_mov_b32 s53, s67 +; GFX12-NEXT: s_mov_b32 s31, s67 +; GFX12-NEXT: s_mov_b32 s41, s67 +; GFX12-NEXT: s_mov_b32 s19, s67 +; GFX12-NEXT: s_mov_b32 s27, s67 +; GFX12-NEXT: s_mov_b32 s47, s67 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s96, 
s11, 30 -; GFX12-NEXT: s_lshr_b32 s98, s11, 31 -; GFX12-NEXT: s_lshr_b32 s92, s11, 28 -; GFX12-NEXT: s_lshr_b32 s94, s11, 29 -; GFX12-NEXT: s_lshr_b32 s78, s11, 26 -; GFX12-NEXT: s_lshr_b32 s88, s11, 27 +; GFX12-NEXT: s_lshr_b32 s96, s3, 30 +; GFX12-NEXT: s_lshr_b32 s98, s3, 31 +; GFX12-NEXT: s_lshr_b32 s92, s3, 28 +; GFX12-NEXT: s_lshr_b32 s94, s3, 29 +; GFX12-NEXT: s_lshr_b32 s78, s3, 26 +; GFX12-NEXT: s_lshr_b32 s88, s3, 27 ; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 -; GFX12-NEXT: s_lshr_b32 s66, s11, 24 -; GFX12-NEXT: s_lshr_b32 s74, s11, 25 +; GFX12-NEXT: s_lshr_b32 s66, s3, 24 +; GFX12-NEXT: s_lshr_b32 s74, s3, 25 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 -; GFX12-NEXT: s_lshr_b32 s56, s11, 22 -; GFX12-NEXT: s_lshr_b32 s62, s11, 23 +; GFX12-NEXT: s_lshr_b32 s56, s3, 22 +; GFX12-NEXT: s_lshr_b32 s62, s3, 23 ; GFX12-NEXT: v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100 ; GFX12-NEXT: v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92 ; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 -; GFX12-NEXT: s_lshr_b32 s44, s11, 20 -; GFX12-NEXT: s_lshr_b32 s52, s11, 21 -; GFX12-NEXT: s_lshr_b32 s30, s11, 18 -; GFX12-NEXT: s_lshr_b32 s40, s11, 19 -; GFX12-NEXT: s_lshr_b32 s18, s11, 16 -; GFX12-NEXT: s_lshr_b32 s26, s11, 17 -; GFX12-NEXT: s_lshr_b32 s2, s11, 14 -; GFX12-NEXT: s_lshr_b32 s4, s11, 15 +; GFX12-NEXT: s_lshr_b32 s44, s3, 20 +; GFX12-NEXT: s_lshr_b32 s52, s3, 21 +; GFX12-NEXT: s_lshr_b32 s30, s3, 18 +; GFX12-NEXT: s_lshr_b32 s40, s3, 19 +; GFX12-NEXT: s_lshr_b32 s18, s3, 16 +; GFX12-NEXT: s_lshr_b32 s26, s3, 17 +; GFX12-NEXT: s_lshr_b32 s4, s3, 14 +; GFX12-NEXT: s_lshr_b32 s6, s3, 15 ; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 ; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 ; GFX12-NEXT: 
s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX12-NEXT: s_lshr_b32 s6, s11, 12 -; GFX12-NEXT: s_lshr_b32 s8, s11, 13 +; GFX12-NEXT: s_lshr_b32 s8, s3, 12 +; GFX12-NEXT: s_lshr_b32 s10, s3, 13 ; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 ; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX12-NEXT: s_lshr_b32 s12, s11, 10 -; GFX12-NEXT: s_lshr_b32 s14, s11, 11 +; GFX12-NEXT: s_lshr_b32 s12, s3, 10 +; GFX12-NEXT: s_lshr_b32 s14, s3, 11 ; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 ; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 @@ -9512,16 +9558,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: s_lshr_b32 s16, s11, 8 -; GFX12-NEXT: s_lshr_b32 s20, s11, 9 +; GFX12-NEXT: s_lshr_b32 s16, s3, 8 +; GFX12-NEXT: s_lshr_b32 s20, s3, 9 ; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 ; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX12-NEXT: s_lshr_b32 s22, s11, 6 -; GFX12-NEXT: s_lshr_b32 s24, s11, 7 +; GFX12-NEXT: s_lshr_b32 s22, s3, 6 +; GFX12-NEXT: s_lshr_b32 s24, s3, 7 ; GFX12-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52 ; GFX12-NEXT: v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30 ; GFX12-NEXT: v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40 @@ 
-9539,39 +9585,39 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:416 ; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:400 ; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:384 -; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_lshr_b32 s28, s11, 4 -; GFX12-NEXT: s_lshr_b32 s34, s11, 5 -; GFX12-NEXT: s_lshr_b32 s36, s11, 2 -; GFX12-NEXT: s_lshr_b32 s38, s11, 3 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX12-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 +; GFX12-NEXT: v_mov_b32_e32 v5, s8 +; GFX12-NEXT: s_lshr_b32 s28, s3, 4 +; GFX12-NEXT: s_lshr_b32 s34, s3, 5 +; GFX12-NEXT: s_lshr_b32 s36, s3, 2 +; GFX12-NEXT: s_lshr_b32 s38, s3, 3 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 -; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s12 -; GFX12-NEXT: s_lshr_b32 s42, s11, 1 -; GFX12-NEXT: s_mov_b32 s46, s11 +; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX12-NEXT: s_lshr_b32 s42, s3, 1 +; GFX12-NEXT: s_mov_b32 s46, s3 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 ; GFX12-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 -; GFX12-NEXT: s_lshr_b32 s48, s10, 30 -; GFX12-NEXT: s_lshr_b32 s50, s10, 31 +; GFX12-NEXT: s_lshr_b32 s48, s2, 30 +; GFX12-NEXT: s_lshr_b32 s50, s2, 31 ; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 
0x10000 ; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20 ; GFX12-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22 -; GFX12-NEXT: s_lshr_b32 s54, s10, 28 -; GFX12-NEXT: s_lshr_b32 s58, s10, 29 +; GFX12-NEXT: s_lshr_b32 s54, s2, 28 +; GFX12-NEXT: s_lshr_b32 s58, s2, 29 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24 ; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28 -; GFX12-NEXT: s_lshr_b32 s60, s10, 26 -; GFX12-NEXT: s_lshr_b32 s64, s10, 27 +; GFX12-NEXT: s_lshr_b32 s60, s2, 26 +; GFX12-NEXT: s_lshr_b32 s64, s2, 27 ; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34 @@ -9586,43 +9632,43 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37 ; GFX12-NEXT: v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39 ; GFX12-NEXT: v_mov_b32_e32 v5, s46 -; GFX12-NEXT: s_lshr_b32 s68, s10, 24 -; GFX12-NEXT: s_lshr_b32 s70, s10, 25 -; GFX12-NEXT: s_lshr_b32 s72, s10, 22 -; GFX12-NEXT: s_lshr_b32 s76, s10, 23 +; GFX12-NEXT: s_lshr_b32 s68, s2, 24 +; GFX12-NEXT: s_lshr_b32 s70, s2, 25 +; GFX12-NEXT: s_lshr_b32 s72, s2, 22 +; GFX12-NEXT: s_lshr_b32 s76, s2, 23 ; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42 ; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48 -; GFX12-NEXT: s_lshr_b32 s80, s10, 20 -; GFX12-NEXT: s_lshr_b32 s82, s10, 21 +; GFX12-NEXT: s_lshr_b32 s80, s2, 20 +; GFX12-NEXT: s_lshr_b32 s82, s2, 21 ; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50 ; GFX12-NEXT: 
v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54 -; GFX12-NEXT: s_lshr_b32 s84, s10, 18 -; GFX12-NEXT: s_lshr_b32 s86, s10, 19 +; GFX12-NEXT: s_lshr_b32 s84, s2, 18 +; GFX12-NEXT: s_lshr_b32 s86, s2, 19 ; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58 ; GFX12-NEXT: v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60 -; GFX12-NEXT: s_lshr_b32 s90, s10, 16 -; GFX12-NEXT: s_lshr_b32 s98, s10, 17 +; GFX12-NEXT: s_lshr_b32 s90, s2, 16 +; GFX12-NEXT: s_lshr_b32 s98, s2, 17 ; GFX12-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64 ; GFX12-NEXT: v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68 -; GFX12-NEXT: s_lshr_b32 s96, s10, 14 -; GFX12-NEXT: s_lshr_b32 s100, s10, 15 -; GFX12-NEXT: s_lshr_b32 s94, s10, 13 -; GFX12-NEXT: s_lshr_b32 s88, s10, 11 -; GFX12-NEXT: s_lshr_b32 s74, s10, 9 -; GFX12-NEXT: s_lshr_b32 s62, s10, 7 -; GFX12-NEXT: s_lshr_b32 s52, s10, 5 -; GFX12-NEXT: s_lshr_b32 s40, s10, 3 -; GFX12-NEXT: s_lshr_b32 s26, s10, 1 +; GFX12-NEXT: s_lshr_b32 s96, s2, 14 +; GFX12-NEXT: s_lshr_b32 s100, s2, 15 +; GFX12-NEXT: s_lshr_b32 s94, s2, 13 +; GFX12-NEXT: s_lshr_b32 s88, s2, 11 +; GFX12-NEXT: s_lshr_b32 s74, s2, 9 +; GFX12-NEXT: s_lshr_b32 s62, s2, 7 +; GFX12-NEXT: s_lshr_b32 s52, s2, 5 +; GFX12-NEXT: s_lshr_b32 s40, s2, 3 +; GFX12-NEXT: s_lshr_b32 s26, s2, 1 ; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70 @@ -9637,19 +9683,19 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73 ; GFX12-NEXT: v_dual_mov_b32 v3, s76 :: 
v_dual_mov_b32 v4, s77 ; GFX12-NEXT: v_mov_b32_e32 v5, s80 -; GFX12-NEXT: s_lshr_b32 s92, s10, 12 -; GFX12-NEXT: s_lshr_b32 s78, s10, 10 +; GFX12-NEXT: s_lshr_b32 s92, s2, 12 +; GFX12-NEXT: s_lshr_b32 s78, s2, 10 ; GFX12-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82 ; GFX12-NEXT: v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84 -; GFX12-NEXT: s_lshr_b32 s66, s10, 8 -; GFX12-NEXT: s_lshr_b32 s56, s10, 6 -; GFX12-NEXT: s_lshr_b32 s44, s10, 4 -; GFX12-NEXT: s_lshr_b32 s30, s10, 2 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 +; GFX12-NEXT: s_lshr_b32 s66, s2, 8 +; GFX12-NEXT: s_lshr_b32 s56, s2, 6 +; GFX12-NEXT: s_lshr_b32 s44, s2, 4 +; GFX12-NEXT: s_lshr_b32 s30, s2, 2 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 @@ -9693,8 +9739,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30 ; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26 ; GFX12-NEXT: v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18 -; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s10 -; GFX12-NEXT: v_mov_b32_e32 v24, s11 +; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s2 +; GFX12-NEXT: v_mov_b32_e32 v24, s3 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 817c5def5614f..606568d8b149a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -6209,6 +6209,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 16 @@ -6233,6 +6234,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 s5, 0 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6571,7 +6573,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5 @@ -6617,6 +6621,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s3, 0 +; GCN-HSA-NEXT: s_mov_b32 s9, s3 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s2, s7 ; GCN-HSA-NEXT: s_mov_b32 s8, s5 @@ -7189,7 +7195,11 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, 0 
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s17, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s11 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s9 @@ -7267,8 +7277,12 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 s7, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s11, s7 +; GCN-HSA-NEXT: s_mov_b32 s21, s7 +; GCN-HSA-NEXT: s_mov_b32 s23, s7 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s6, s19 ; GCN-HSA-NEXT: s_mov_b32 s10, s17 @@ -8327,20 +8341,21 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s69, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s69 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s68, s15 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s13 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s11 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s8, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[20:21], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[70:71], s[18:19], 0x100000 ; 
GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s6, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s4, 16 @@ -8350,7 +8365,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[14:15], 0x100000 @@ -8361,15 +8376,22 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[68:69], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s71 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s51, s69 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s53, s69 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s57, s69 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s55, s69 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s43, s69 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s45, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s71 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -8405,7 +8427,7 @@ define amdgpu_kernel void 
@constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 @@ -8429,16 +8451,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -8472,10 +8494,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 s43, 0 +; GCN-HSA-NEXT: s_mov_b32 s49, s43 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 
s[0:15], s[18:19], 0x0 +; GCN-HSA-NEXT: s_mov_b32 s51, s43 +; GCN-HSA-NEXT: s_mov_b32 s53, s43 +; GCN-HSA-NEXT: s_mov_b32 s55, s43 +; GCN-HSA-NEXT: s_mov_b32 s57, s43 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s40, s15 +; GCN-HSA-NEXT: s_mov_b32 s42, s15 ; GCN-HSA-NEXT: s_mov_b32 s48, s13 ; GCN-HSA-NEXT: s_mov_b32 s50, s11 ; GCN-HSA-NEXT: s_mov_b32 s52, s9 @@ -8496,14 +8524,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[40:41], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s45, s43 +; GCN-HSA-NEXT: s_mov_b32 s59, s43 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[42:43], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 @@ -8520,7 +8550,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 @@ -8544,14 +8574,14 @@ define amdgpu_kernel void 
@constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s43 -; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 +; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s41 +; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 @@ -8567,12 +8597,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] @@ -8596,10 +8626,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 
v19, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 3b0f8523e1b52..10d7541d8722d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6395,7 +6395,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: s_mov_b32 s9, s7 +; GFX6-NOHSA-NEXT: s_mov_b32 s11, s7 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 8 @@ -6438,10 +6441,13 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 s5, 0 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_mov_b32 s7, s5 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 s9, s5 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 8 @@ -6496,12 +6502,14 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0 +; GFX8-NOHSA-NEXT: s_mov_b32 s7, s5 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s8, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s6, s3 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 @@ -6509,8 +6517,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 @@ -6527,10 +6535,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6608,29 +6616,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 
s7, s5 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 -; GFX12-NEXT: s_lshr_b32 s6, s3, 8 -; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_lshr_b32 s8, s3, 8 +; GFX12-NEXT: s_mov_b32 s6, s3 ; GFX12-NEXT: s_lshr_b32 s10, s2, 16 ; GFX12-NEXT: s_lshr_b32 s12, s2, 24 ; GFX12-NEXT: s_lshr_b32 s14, s2, 8 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7 -; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 ; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s15 ; GFX12-NEXT: v_mov_b32_e32 v6, s14 @@ -7027,17 +7038,23 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s13, 0 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: s_mov_b32 s15, s13 +; GFX6-NOHSA-NEXT: s_mov_b32 s5, s13 +; GFX6-NOHSA-NEXT: s_mov_b32 s17, s13 
+; GFX6-NOHSA-NEXT: s_mov_b32 s19, s13 +; GFX6-NOHSA-NEXT: s_mov_b32 s21, s13 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s11, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s11, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s4, s11 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s10, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s10, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s10, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s9, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s9, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s9 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s10, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s10, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s10, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s9, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s9, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s20, s9 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s8, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s8, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s8, 8 @@ -7045,16 +7062,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[8:9], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 @@ -7065,8 +7082,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s21 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 @@ -7076,21 +7093,21 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s17 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v16, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s19 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 @@ -7107,35 +7124,42 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 s5, 0 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 s13, s5 +; GFX7-HSA-NEXT: s_mov_b32 s15, s5 +; GFX7-HSA-NEXT: s_mov_b32 s23, s5 +; GFX7-HSA-NEXT: s_mov_b32 s25, s5 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s12, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s24, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[4:5], s[6:7], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 
0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s4, s11, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s11, 8 +; GFX7-HSA-NEXT: s_mov_b32 s14, s11 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s10, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s10, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s9, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s9, 8 +; GFX7-HSA-NEXT: s_mov_b32 s26, s9 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s8, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s30, s8, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s8, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[6:7], s[8:9], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 +; GFX7-HSA-NEXT: s_mov_b32 s27, s5 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -7143,65 +7167,65 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v9, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 +; GFX7-HSA-NEXT: s_add_u32 s30, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s31, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX7-HSA-NEXT: s_add_u32 s12, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX7-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 32 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; 
GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -7216,12 +7240,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NOHSA-NEXT: s_mov_b32 s11, 0 +; GFX8-NOHSA-NEXT: s_mov_b32 s13, s11 +; GFX8-NOHSA-NEXT: s_mov_b32 s23, s11 +; GFX8-NOHSA-NEXT: s_mov_b32 s27, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s14, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s12, s7 ; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 @@ -7244,8 +7272,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 @@ -7259,10 +7287,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7424,62 +7452,68 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s11, s3 +; GFX12-NEXT: s_mov_b32 s13, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s2, s7, 16 -; GFX12-NEXT: s_lshr_b32 s8, s7, 8 -; GFX12-NEXT: s_mov_b32 s10, s7 -; GFX12-NEXT: s_lshr_b32 s12, s6, 16 -; GFX12-NEXT: s_lshr_b32 s14, s6, 24 -; GFX12-NEXT: s_lshr_b32 s16, s6, 8 +; GFX12-NEXT: s_lshr_b32 s14, s7, 8 +; GFX12-NEXT: s_mov_b32 s8, s7 +; GFX12-NEXT: s_lshr_b32 s16, s6, 16 +; GFX12-NEXT: s_lshr_b32 s18, s6, 24 +; GFX12-NEXT: s_lshr_b32 s20, s6, 8 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: s_lshr_b32 s18, s5, 16 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v26, 0 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s35 ; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v9, s11 -; GFX12-NEXT: s_lshr_b32 s20, s5, 8 -; GFX12-NEXT: s_mov_b32 s22, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s9 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s10 
:: v_dual_mov_b32 v11, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s13 +; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v9, s15 +; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17 +; GFX12-NEXT: s_lshr_b32 s10, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: s_lshr_b32 s22, s5, 8 +; GFX12-NEXT: s_mov_b32 s12, s5 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX12-NEXT: s_lshr_b32 s24, s4, 16 ; GFX12-NEXT: s_lshr_b32 s26, s4, 24 ; GFX12-NEXT: s_lshr_b32 s28, s4, 8 ; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 -; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v13, s19 +; GFX12-NEXT: v_mov_b32_e32 v12, s18 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v26, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v26, v[6:9], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v7, s21 :: v_dual_mov_b32 v6, s20 +; GFX12-NEXT: v_mov_b32_e32 v1, s11 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s5 -; GFX12-NEXT: v_mov_b32_e32 v18, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v15, s13 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96 
-; GFX12-NEXT: v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v0, s22 -; GFX12-NEXT: v_dual_mov_b32 v3, s21 :: v_dual_mov_b32 v2, s20 -; GFX12-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v8, s24 -; GFX12-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v10, s26 -; GFX12-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v20, s30 -; GFX12-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v22, s28 +; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s23 +; GFX12-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v19, s25 +; GFX12-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v21, s27 +; GFX12-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v23, s31 +; GFX12-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v25, s29 +; GFX12-NEXT: v_mov_b32_e32 v24, s28 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] +; GFX12-NEXT: global_store_b128 v26, v[10:13], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v26, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v26, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v26, v[14:17], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v26, v[18:21], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v26, v[22:25], s[0:1] ; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = sext <16 x i8> %load to <16 x i64> @@ -8190,37 +8224,39 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s47, 0 +; GFX6-NOHSA-NEXT: s_mov_b32 s11, s47 +; GFX6-NOHSA-NEXT: 
s_mov_b32 s31, s47 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s38, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s42, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s34, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 16 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s30, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[64:65], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s54, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 8 -; 
GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 +; GFX6-NOHSA-NEXT: s_mov_b32 s58, s1 ; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s52, s0, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 8 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[38:39], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[30:31], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[60:61], s[2:3], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 @@ -8228,143 +8264,162 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s65 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s71 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s63 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s67 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s71 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s63 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s15, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s17, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s21, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s35, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s37, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s55, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s45, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s49, s47 +; GFX6-NOHSA-NEXT: s_mov_b32 s59, s47 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[18:19], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[20:21], 0x80000 +; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[64:65], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[52:53], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 -; 
GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s61 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s27 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s65 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s63 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s69 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 s15, 0 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: 
s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 s47, s15 +; GFX7-HSA-NEXT: s_mov_b32 s49, s15 +; GFX7-HSA-NEXT: s_mov_b32 s59, s15 +; GFX7-HSA-NEXT: s_mov_b32 s61, s15 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s52, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s5, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s46, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s48, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8 ; GFX7-HSA-NEXT: s_mov_b32 s62, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 ; GFX7-HSA-NEXT: s_mov_b32 s34, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s20, s1, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 @@ -8374,13 +8429,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[18:19], s[2:3], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[44:45], s[4:5], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 56 ; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 +; GFX7-HSA-NEXT: s_mov_b32 s63, s15 +; 
GFX7-HSA-NEXT: s_mov_b32 s37, s15 +; GFX7-HSA-NEXT: s_mov_b32 s31, s15 +; GFX7-HSA-NEXT: s_mov_b32 s35, s15 +; GFX7-HSA-NEXT: s_mov_b32 s21, s15 +; GFX7-HSA-NEXT: s_mov_b32 s65, s15 +; GFX7-HSA-NEXT: s_mov_b32 s17, s15 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[6:7], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 @@ -8392,75 +8454,73 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 
s64, s8, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s50 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s51 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s51 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s44 -; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s46 +; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s47 +; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s40 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s41 +; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s41 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45 -; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s41 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s46 +; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s47 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v4, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 -; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 -; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s29 -; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s25 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s18 ; GFX7-HSA-NEXT: 
s_add_u32 s18, s8, 0x70 @@ -8472,17 +8532,19 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x60 ; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s36 @@ -8497,10 +8559,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8553,12 +8615,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr 
addrspace(1) %o ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NOHSA-NEXT: s_mov_b32 s47, 0 +; GFX8-NOHSA-NEXT: s_mov_b32 s49, s47 +; GFX8-NOHSA-NEXT: s_mov_b32 s59, s47 +; GFX8-NOHSA-NEXT: s_mov_b32 s63, s47 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_mov_b32 s37, s47 +; GFX8-NOHSA-NEXT: s_mov_b32 s29, s47 +; GFX8-NOHSA-NEXT: s_mov_b32 s19, s47 +; GFX8-NOHSA-NEXT: s_mov_b32 s65, s47 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s50, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s48, s7 ; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s6, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s6, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 8 @@ -8609,8 +8679,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 ; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xf0 @@ -8624,10 +8694,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 
s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 ; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8957,86 +9027,94 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s39, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s41, s39 +; GFX12-NEXT: s_mov_b32 s25, s39 +; GFX12-NEXT: s_mov_b32 s43, s39 +; GFX12-NEXT: s_mov_b32 s45, s39 +; GFX12-NEXT: s_mov_b32 s35, s39 +; GFX12-NEXT: s_mov_b32 s19, s39 +; GFX12-NEXT: s_mov_b32 s15, s39 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s36, s7, 16 -; GFX12-NEXT: s_lshr_b32 s38, s7, 8 +; GFX12-NEXT: s_lshr_b32 s38, s7, 16 +; GFX12-NEXT: s_lshr_b32 s46, s7, 8 ; GFX12-NEXT: s_mov_b32 s40, s7 -; GFX12-NEXT: s_lshr_b32 s42, s6, 16 -; GFX12-NEXT: s_lshr_b32 s44, s6, 24 +; GFX12-NEXT: s_lshr_b32 s48, s6, 16 +; GFX12-NEXT: s_lshr_b32 s50, s6, 24 ; GFX12-NEXT: s_ashr_i64 s[74:75], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX12-NEXT: s_lshr_b32 s46, s6, 8 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s75 +; GFX12-NEXT: s_lshr_b32 s52, s6, 8 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s39 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s38 :: v_dual_mov_b32 v3, s75 ; GFX12-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v5, s41 -; GFX12-NEXT: s_lshr_b32 s48, s5, 16 +; GFX12-NEXT: s_lshr_b32 s24, s5, 16 ; GFX12-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s39 -; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s43 -; GFX12-NEXT: s_lshr_b32 s50, s5, 8 -; GFX12-NEXT: s_mov_b32 s52, s5 -; GFX12-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45 -; GFX12-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s73 -; GFX12-NEXT: s_lshr_b32 s54, s4, 16 -; GFX12-NEXT: s_lshr_b32 s56, s4, 24 -; GFX12-NEXT: s_ashr_i64 s[70:71], s[4:5], 56 -; GFX12-NEXT: v_dual_mov_b32 v12, s72 :: v_dual_mov_b32 v15, s47 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x80000 -; GFX12-NEXT: v_mov_b32_e32 v14, s46 -; GFX12-NEXT: s_lshr_b32 s58, s4, 8 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX12-NEXT: s_lshr_b32 s60, s3, 16 -; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s47 +; GFX12-NEXT: v_dual_mov_b32 v6, s46 :: v_dual_mov_b32 v9, s49 +; GFX12-NEXT: s_lshr_b32 s54, s5, 8 +; GFX12-NEXT: s_mov_b32 s42, s5 +; GFX12-NEXT: v_dual_mov_b32 v8, s48 :: v_dual_mov_b32 v11, s51 +; GFX12-NEXT: v_dual_mov_b32 v10, s50 :: v_dual_mov_b32 v13, s73 +; GFX12-NEXT: s_lshr_b32 s56, s4, 16 +; GFX12-NEXT: s_lshr_b32 s58, s4, 24 +; GFX12-NEXT: s_ashr_i64 s[70:71], s[4:5], 56 +; GFX12-NEXT: v_dual_mov_b32 v12, s72 :: v_dual_mov_b32 v15, s53 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v14, s52 +; GFX12-NEXT: s_lshr_b32 s60, s4, 8 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX12-NEXT: s_lshr_b32 s44, s3, 16 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; 
GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:240 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v0, s36 +; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24 ; GFX12-NEXT: v_dual_mov_b32 v3, s71 :: v_dual_mov_b32 v2, s70 -; GFX12-NEXT: v_mov_b32_e32 v5, s53 -; GFX12-NEXT: s_lshr_b32 s34, s3, 8 -; GFX12-NEXT: s_mov_b32 s30, s3 -; GFX12-NEXT: s_lshr_b32 s24, s2, 16 +; GFX12-NEXT: v_mov_b32_e32 v5, s43 +; GFX12-NEXT: s_lshr_b32 s36, s3, 8 +; GFX12-NEXT: s_mov_b32 s34, s3 +; GFX12-NEXT: s_lshr_b32 s26, s2, 16 ; GFX12-NEXT: s_lshr_b32 s22, s2, 24 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s51 -; GFX12-NEXT: v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s55 -; GFX12-NEXT: s_lshr_b32 s20, s2, 8 -; GFX12-NEXT: s_ashr_i64 s[26:27], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s57 -; GFX12-NEXT: v_dual_mov_b32 v10, s56 :: v_dual_mov_b32 v13, s29 +; GFX12-NEXT: v_dual_mov_b32 v4, s42 :: v_dual_mov_b32 v7, s55 +; GFX12-NEXT: v_dual_mov_b32 v6, s54 :: v_dual_mov_b32 v9, s57 +; GFX12-NEXT: s_lshr_b32 s20, s2, 8 +; GFX12-NEXT: s_ashr_i64 s[28:29], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59 +; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s31 ; GFX12-NEXT: s_lshr_b32 s18, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: s_bfe_i64 
s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v15, s59 -; GFX12-NEXT: v_dual_mov_b32 v14, s58 :: v_dual_mov_b32 v17, s61 -; GFX12-NEXT: s_lshr_b32 s14, s1, 8 -; GFX12-NEXT: s_mov_b32 s62, s1 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s30 :: v_dual_mov_b32 v15, s61 +; GFX12-NEXT: v_dual_mov_b32 v14, s60 :: v_dual_mov_b32 v17, s45 +; GFX12-NEXT: s_lshr_b32 s62, s1, 8 +; GFX12-NEXT: s_mov_b32 s14, s1 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, s60 :: v_dual_mov_b32 v19, s27 -; GFX12-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v21, s31 +; GFX12-NEXT: v_dual_mov_b32 v16, s44 :: v_dual_mov_b32 v19, s29 +; GFX12-NEXT: v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v21, s35 ; GFX12-NEXT: s_lshr_b32 s64, s0, 16 ; GFX12-NEXT: s_lshr_b32 s66, s0, 24 ; GFX12-NEXT: s_ashr_i64 s[12:13], s[0:1], 56 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v20, s30 :: v_dual_mov_b32 v23, s35 -; GFX12-NEXT: v_mov_b32_e32 v22, s34 +; GFX12-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v23, s37 +; GFX12-NEXT: v_mov_b32_e32 v22, s36 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 @@ -9044,12 +9122,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24 +; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s26 ; GFX12-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 ; GFX12-NEXT: 
v_mov_b32_e32 v5, s17 ; GFX12-NEXT: s_lshr_b32 s68, s0, 8 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[14:15], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s21 ; GFX12-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v9, s19 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll index 5fc1a87e71a1a..bea1ca1bc05b5 100644 --- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll @@ -112,8 +112,9 @@ define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_lshlrev_b32_e32 v1, 23, v0 -; SDAG-NEXT: v_ashrrev_i64 v[0:1], 55, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 23, v0 +; SDAG-NEXT: v_ashrrev_i64 v[0:1], 55, v[1:2] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: range_metadata_sext_i8_signed_range_i64: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index c5c95380fde9b..5b94398908a56 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -463,12 +463,13 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { ; SI-LABEL: mad_i64_i32_sextops_i31_i63: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; SI-NEXT: v_ashr_i64 v[4:5], v[3:4], 33 -; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 33 -; SI-NEXT: v_mul_lo_u32 v1, v4, v0 -; SI-NEXT: v_mul_hi_i32 v4, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 1, v0 +; 
SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_ashr_i64 v[6:7], v[4:5], 33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 1, v1 +; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 33 +; SI-NEXT: v_mul_lo_u32 v1, v6, v0 +; SI-NEXT: v_mul_hi_i32 v4, v6, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 11cf129b1e479..9b733a1c6012f 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -464,19 +464,20 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_load_dword s6, s[4:5], 0xb -; SI-NEXT: s_load_dword s4, s[4:5], 0xd +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_lshl_b32 s7, s4, 8 -; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 +; SI-NEXT: s_lshl_b32 s5, s2, 8 +; SI-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mul_i32 s5, s4, s6 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s4 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s6, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -484,17 +485,18 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s3, s[4:5], 
0x2c ; VI-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 8 -; VI-NEXT: s_lshl_b32 s5, s6, 8 -; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 +; VI-NEXT: s_lshl_b32 s3, s6, 8 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 -; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; VI-NEXT: v_mul_i32_i24_e32 v0, s4, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] @@ -506,19 +508,20 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s7, 8 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s5, s8, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s5, s6, s4 +; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -610,16 +613,17 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: 
s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s7, 8 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s5, s8, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s4, s6, s4 ; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index a166c4f93462d..885c0829a88c3 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -461,12 +461,12 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -491,12 +491,12 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -676,7 +676,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -684,19 +684,19 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s2, 0, s9 +; GCN-NEXT: s_sub_i32 s4, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_abs_i32 s0, s2 ; GCN-NEXT: s_xor_b32 s1, s2, s8 -; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_ashr_i32 s1, s1, 31 +; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_abs_i32 s0, s2 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s2, v0 @@ -720,7 +720,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: 
s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -728,19 +728,19 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 +; GCN-IR-NEXT: s_sub_i32 s4, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_abs_i32 s0, s2 ; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 -; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_abs_i32 s0, s2 +; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 @@ -772,12 +772,12 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -802,12 +802,12 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: 
s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -838,7 +838,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -846,19 +846,19 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s2, 0, s9 +; GCN-NEXT: s_sub_i32 s4, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_abs_i32 s0, s2 ; GCN-NEXT: s_xor_b32 s1, s2, s8 -; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_ashr_i32 s1, s1, 31 +; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_abs_i32 s0, s2 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s2, v0 @@ -882,7 +882,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: 
s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -890,19 +890,19 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 +; GCN-IR-NEXT: s_sub_i32 s4, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_abs_i32 s0, s2 ; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 -; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_abs_i32 s0, s2 +; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 65a17ed67481c..9b937e6524559 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -149,11 +149,12 @@ define i128 @v_lshr_i128_kv(i128 %rhs) { ; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, 0x41 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 
v1, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0x41 +; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -167,10 +168,11 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c9e5ff444f715..f7e0b7ea79437 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -443,12 +443,12 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -478,12 +478,12 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -520,12 +520,12 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -555,12 +555,12 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -650,7 +650,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) @@ -658,16 +658,16 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: s_sub_i32 s4, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 ; GCN-NEXT: s_abs_i32 s3, s2 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -691,7 +691,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -699,16 +699,16 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s4, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 ; GCN-IR-NEXT: s_abs_i32 s3, s2 +; 
GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -739,7 +739,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -747,16 +747,16 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: s_sub_i32 s4, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 ; GCN-NEXT: s_abs_i32 s3, s2 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -780,7 +780,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; 
GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -788,16 +788,16 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s4, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 ; GCN-IR-NEXT: s_abs_i32 s3, s2 +; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll index 4d091c2302658..8c3f2880f22a2 100644 --- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll @@ -26,7 +26,7 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: vmov.32 d9[1], r5 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: mvn r4, #0 +; CHECK-NEXT: mvn r12, #0 ; CHECK-NEXT: movwlt r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: mvnne r5, #0 @@ -34,7 +34,6 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: vmov.32 d8[1], r1 ; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: vmov.i32 q10, #0x80000000 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vdup.32 d19, r5 @@ -43,22 +42,24 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: vdup.32 d18, r0 ; CHECK-NEXT: vbit q8, q4, q9 +; 
CHECK-NEXT: adr r4, .LCPI0_1 +; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128] ; CHECK-NEXT: vmov r0, r1, d17 ; CHECK-NEXT: vmov r3, r5, d16 ; CHECK-NEXT: rsbs r0, r0, #-2147483648 -; CHECK-NEXT: sbcs r0, r4, r1 +; CHECK-NEXT: sbcs r0, r12, r1 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: rsbs r1, r3, #-2147483648 -; CHECK-NEXT: sbcs r1, r4, r5 -; CHECK-NEXT: vdup.32 d19, r0 +; CHECK-NEXT: sbcs r1, r12, r5 +; CHECK-NEXT: vdup.32 d21, r0 ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d18, r2 -; CHECK-NEXT: vbif q8, q10, q9 +; CHECK-NEXT: vdup.32 d20, r2 +; CHECK-NEXT: vbif q8, q9, q10 ; CHECK-NEXT: vmovn.i64 d0, q8 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r11, pc} @@ -69,6 +70,11 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff ; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff entry: %conv = fptosi <2 x double> %x to <2 x i64> %0 = icmp slt <2 x i64> %conv, @@ -94,20 +100,22 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) { ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: vmov.32 d9[0], r4 ; CHECK-NEXT: bl __aeabi_d2ulz +; CHECK-NEXT: vmov.32 d8[0], r0 ; CHECK-NEXT: mvn r3, #0 ; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: vmov.32 d9[1], r5 ; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: vmov.32 d8[0], r0 ; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movwlo r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: mvnne r5, #0 ; CHECK-NEXT: subs r0, r0, r3 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vdup.32 d17, r5 +; CHECK-NEXT: vmov.32 d8[1], r1 ; CHECK-NEXT: movwlo r2, #1 ; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vdup.32 d17, r5 ; CHECK-NEXT: mvnne 
r2, #0 ; CHECK-NEXT: vdup.32 d16, r2 ; CHECK-NEXT: vand q9, q4, q8 @@ -169,11 +177,11 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: rsbs r1, r3, #0 ; CHECK-NEXT: rscs r1, r5, #0 -; CHECK-NEXT: vmov.32 d19[0], r0 +; CHECK-NEXT: vdup.32 d19, r0 ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vmov.32 d18[0], r2 +; CHECK-NEXT: vdup.32 d18, r2 ; CHECK-NEXT: vand q8, q8, q9 ; CHECK-NEXT: vmovn.i64 d0, q8 ; CHECK-NEXT: vpop {d8, d9} @@ -324,57 +332,61 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vorr q4, q0, q0 -; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: vmov r5, s19 -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: vmov.32 d9[0], r10 -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: vmov.32 d8[0], r0 -; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: vmov.32 d9[0], r8 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: vmov.32 d10[0], r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov.32 d11[0], r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vmov.32 d8[0], r0 ; CHECK-NEXT: mvn r3, #0 -; CHECK-NEXT: vmov.32 d10[0], r0 ; CHECK-NEXT: subs r0, r0, r3 ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov.32 d9[1], r9 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movwlo r0, #1 ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov.32 d8[1], r1 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: subs r1, r5, r3 -; 
CHECK-NEXT: sbcs r1, r4, #0 +; CHECK-NEXT: subs r1, r8, r3 +; CHECK-NEXT: sbcs r1, r9, #0 +; CHECK-NEXT: vmov.32 d11[1], r5 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movwlo r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: subs r7, r10, r3 -; CHECK-NEXT: sbcs r7, r8, #0 +; CHECK-NEXT: subs r6, r4, r3 +; CHECK-NEXT: sbcs r6, r5, #0 ; CHECK-NEXT: vdup.32 d19, r1 -; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: mov r6, #0 ; CHECK-NEXT: vdup.32 d18, r0 -; CHECK-NEXT: movwlo r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: mvnne r7, #0 -; CHECK-NEXT: subs r3, r6, r3 -; CHECK-NEXT: sbcs r3, r9, #0 -; CHECK-NEXT: vdup.32 d17, r7 +; CHECK-NEXT: movwlo r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: mvnne r6, #0 +; CHECK-NEXT: subs r3, r10, r3 +; CHECK-NEXT: sbcs r3, r7, #0 +; CHECK-NEXT: vmov.32 d10[1], r7 ; CHECK-NEXT: movwlo r2, #1 ; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vdup.32 d17, r6 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vand q10, q5, q9 +; CHECK-NEXT: vand q10, q4, q9 ; CHECK-NEXT: vdup.32 d16, r2 -; CHECK-NEXT: vand q11, q4, q8 +; CHECK-NEXT: vand q11, q5, q8 ; CHECK-NEXT: vorn q9, q10, q9 ; CHECK-NEXT: vorn q8, q11, q8 ; CHECK-NEXT: vmovn.i64 d1, q9 @@ -397,45 +409,46 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vorr q4, q0, q0 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: vmov r5, s17 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.32 d17[0], r2 +; CHECK-NEXT: mvn r4, #0 +; CHECK-NEXT: subs r2, r2, r4 ; CHECK-NEXT: vmov r8, s16 -; CHECK-NEXT: vmov.32 d9[0], r6 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mvn r9, #0 -; CHECK-NEXT: subs r2, r6, r9 -; CHECK-NEXT: sbcs r2, r7, #0 -; CHECK-NEXT: 
vmov.32 d8[0], r0 -; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: vmov.32 d16[0], r5 ; CHECK-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: vmov.32 d17[1], r1 +; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: movwlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mvnne r1, #0 +; CHECK-NEXT: subs r2, r5, r4 +; CHECK-NEXT: sbcs r2, r6, #0 +; CHECK-NEXT: vdup.32 d19, r1 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: vmov.32 d16[1], r6 ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: subs r0, r0, r9 -; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov.32 d9[1], r7 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: movwlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov.32 d8[1], r1 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d17, r2 -; CHECK-NEXT: vdup.32 d16, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: vbif q4, q5, q8 +; CHECK-NEXT: vdup.32 d18, r2 +; CHECK-NEXT: vorr q4, q9, q9 +; CHECK-NEXT: vbsl q4, q8, q5 +; CHECK-NEXT: vmov r10, r9, d8 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov.32 d13[0], r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: vmov r7, r10, d8 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: subs r2, r5, r9 +; CHECK-NEXT: subs r2, r5, r4 ; CHECK-NEXT: vmov.32 d12[0], r0 ; CHECK-NEXT: sbcs r2, r6, #0 ; CHECK-NEXT: mov r2, #0 @@ -443,25 +456,25 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: subs r0, r0, r9 +; CHECK-NEXT: subs r0, r0, r4 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vdup.32 d17, r2 -; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: vmov.32 d12[1], r1 +; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vdup.32 d17, r2 ; CHECK-NEXT: vdup.32 d16, r0 -; CHECK-NEXT: rsbs r7, r7, #0 +; CHECK-NEXT: vmov r0, r1, 
d9 ; CHECK-NEXT: vbsl q8, q6, q5 -; CHECK-NEXT: rscs r7, r10, #0 -; CHECK-NEXT: mov r7, #0 -; CHECK-NEXT: movwlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: vmov r0, r1, d17 -; CHECK-NEXT: mvnne r7, #0 -; CHECK-NEXT: vmov r6, r5, d16 +; CHECK-NEXT: rsbs r6, r10, #0 +; CHECK-NEXT: rscs r6, r9, #0 +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: movwlt r6, #1 +; CHECK-NEXT: vmov r5, r4, d16 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: mvnne r6, #0 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: rscs r0, r1, #0 ; CHECK-NEXT: mov r0, #0 @@ -470,20 +483,20 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: rscs r1, r3, #0 -; CHECK-NEXT: vmov.32 d19[0], r0 +; CHECK-NEXT: vdup.32 d21, r0 ; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: vdup.32 d20, r6 ; CHECK-NEXT: movwlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: rsbs r0, r6, #0 -; CHECK-NEXT: rscs r0, r5, #0 -; CHECK-NEXT: vmov.32 d21[0], r1 -; CHECK-NEXT: movwlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: vmov.32 d20[0], r7 -; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vmov.32 d18[0], r4 +; CHECK-NEXT: rsbs r2, r5, #0 +; CHECK-NEXT: rscs r2, r4, #0 +; CHECK-NEXT: vdup.32 d19, r1 +; CHECK-NEXT: movwlt r7, #1 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: mvnne r7, #0 ; CHECK-NEXT: vand q10, q4, q10 +; CHECK-NEXT: vdup.32 d18, r7 ; CHECK-NEXT: vand q8, q8, q9 ; CHECK-NEXT: vmovn.i64 d1, q10 ; CHECK-NEXT: vmovn.i64 d0, q8 @@ -751,141 +764,151 @@ entry: define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NEON-LABEL: utesth_f16i32: ; CHECK-NEON: @ %bb.0: @ %entry -; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEON-NEXT: .vsave {d12, d13} -; CHECK-NEON-NEXT: vpush {d12, d13} -; CHECK-NEON-NEXT: .vsave {d8, d9, d10} -; CHECK-NEON-NEXT: vpush {d8, d9, d10} +; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEON-NEXT: 
push {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEON-NEXT: vmov r0, s3 ; CHECK-NEON-NEXT: vmov.f32 s16, s2 ; CHECK-NEON-NEXT: vmov.f32 s18, s1 ; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mov r4, r0 -; CHECK-NEON-NEXT: vmov r0, s18 -; CHECK-NEON-NEXT: mov r8, r1 -; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mov r6, r0 -; CHECK-NEON-NEXT: vmov.32 d13[0], r0 +; CHECK-NEON-NEXT: mov r8, r0 ; CHECK-NEON-NEXT: vmov r0, s20 ; CHECK-NEON-NEXT: mov r9, r1 +; CHECK-NEON-NEXT: vmov r4, s18 +; CHECK-NEON-NEXT: vmov r6, s16 +; CHECK-NEON-NEXT: vmov.32 d9[0], r8 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mov r5, r0 -; CHECK-NEON-NEXT: vmov.32 d12[0], r0 -; CHECK-NEON-NEXT: vmov r0, s16 +; CHECK-NEON-NEXT: mov r10, r0 +; CHECK-NEON-NEXT: vmov.32 d10[0], r0 +; CHECK-NEON-NEXT: mov r0, r4 ; CHECK-NEON-NEXT: mov r7, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: vmov.32 d9[0], r4 ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mvn r3, #0 +; CHECK-NEON-NEXT: mov r4, r0 +; CHECK-NEON-NEXT: vmov.32 d11[0], r0 +; CHECK-NEON-NEXT: mov r0, r6 +; CHECK-NEON-NEXT: mov r5, r1 +; CHECK-NEON-NEXT: bl __aeabi_h2f +; CHECK-NEON-NEXT: bl __aeabi_f2ulz ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 +; CHECK-NEON-NEXT: mvn r3, #0 ; CHECK-NEON-NEXT: subs r0, r0, r3 ; CHECK-NEON-NEXT: mov r2, #0 ; CHECK-NEON-NEXT: sbcs r0, r1, #0 +; CHECK-NEON-NEXT: vmov.32 d9[1], r9 ; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: movwlo r0, #1 ; CHECK-NEON-NEXT: cmp r0, #0 +; CHECK-NEON-NEXT: vmov.32 d8[1], r1 ; CHECK-NEON-NEXT: mvnne r0, #0 -; CHECK-NEON-NEXT: subs r1, r4, r3 -; CHECK-NEON-NEXT: sbcs r1, r8, #0 +; CHECK-NEON-NEXT: subs r1, r8, r3 +; CHECK-NEON-NEXT: sbcs r1, r9, #0 +; CHECK-NEON-NEXT: vmov.32 d11[1], r5 ; CHECK-NEON-NEXT: mov r1, 
#0 ; CHECK-NEON-NEXT: movwlo r1, #1 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: subs r6, r6, r3 -; CHECK-NEON-NEXT: sbcs r6, r9, #0 +; CHECK-NEON-NEXT: subs r6, r4, r3 +; CHECK-NEON-NEXT: sbcs r6, r5, #0 ; CHECK-NEON-NEXT: vdup.32 d19, r1 ; CHECK-NEON-NEXT: mov r6, #0 ; CHECK-NEON-NEXT: vdup.32 d18, r0 ; CHECK-NEON-NEXT: movwlo r6, #1 ; CHECK-NEON-NEXT: cmp r6, #0 ; CHECK-NEON-NEXT: mvnne r6, #0 -; CHECK-NEON-NEXT: subs r3, r5, r3 +; CHECK-NEON-NEXT: subs r3, r10, r3 ; CHECK-NEON-NEXT: sbcs r3, r7, #0 -; CHECK-NEON-NEXT: vdup.32 d17, r6 +; CHECK-NEON-NEXT: vmov.32 d10[1], r7 ; CHECK-NEON-NEXT: movwlo r2, #1 ; CHECK-NEON-NEXT: cmp r2, #0 +; CHECK-NEON-NEXT: vdup.32 d17, r6 ; CHECK-NEON-NEXT: mvnne r2, #0 ; CHECK-NEON-NEXT: vand q10, q4, q9 ; CHECK-NEON-NEXT: vdup.32 d16, r2 -; CHECK-NEON-NEXT: vand q11, q6, q8 +; CHECK-NEON-NEXT: vand q11, q5, q8 ; CHECK-NEON-NEXT: vorn q9, q10, q9 ; CHECK-NEON-NEXT: vorn q8, q11, q8 ; CHECK-NEON-NEXT: vmovn.i64 d1, q9 ; CHECK-NEON-NEXT: vmovn.i64 d0, q8 -; CHECK-NEON-NEXT: vpop {d8, d9, d10} -; CHECK-NEON-NEXT: vpop {d12, d13} -; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc} ; ; CHECK-FP16-LABEL: utesth_f16i32: ; CHECK-FP16: @ %bb.0: @ %entry ; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11} -; CHECK-FP16-NEXT: vmov.u16 r0, d0[1] +; CHECK-FP16-NEXT: .vsave {d10, d11, d12, d13} +; CHECK-FP16-NEXT: vpush {d10, d11, d12, d13} +; CHECK-FP16-NEXT: .vsave {d8} +; CHECK-FP16-NEXT: vpush {d8} +; CHECK-FP16-NEXT: vmov.u16 r0, d0[3] ; CHECK-FP16-NEXT: vorr d8, d0, d0 -; CHECK-FP16-NEXT: vmov.u16 r5, d0[3] +; CHECK-FP16-NEXT: vmov.u16 r4, d0[1] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: mov r4, r0 
+; CHECK-FP16-NEXT: mov r6, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[0] -; CHECK-FP16-NEXT: mov r8, r1 -; CHECK-FP16-NEXT: vmov.32 d11[0], r4 +; CHECK-FP16-NEXT: mov r7, r1 +; CHECK-FP16-NEXT: vmov.32 d11[0], r6 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: vmov s0, r5 -; CHECK-FP16-NEXT: mov r6, r0 +; CHECK-FP16-NEXT: vmov s0, r4 +; CHECK-FP16-NEXT: mov r8, r0 ; CHECK-FP16-NEXT: mov r9, r1 -; CHECK-FP16-NEXT: vmov.32 d10[0], r0 +; CHECK-FP16-NEXT: vmov.32 d12[0], r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: mov r5, r0 +; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] -; CHECK-FP16-NEXT: mov r7, r1 -; CHECK-FP16-NEXT: vmov.32 d9[0], r5 +; CHECK-FP16-NEXT: mov r5, r1 +; CHECK-FP16-NEXT: vmov.32 d13[0], r4 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi +; CHECK-FP16-NEXT: vmov.32 d10[0], r0 ; CHECK-FP16-NEXT: mvn r3, #0 -; CHECK-FP16-NEXT: vmov.32 d8[0], r0 ; CHECK-FP16-NEXT: subs r0, r0, r3 ; CHECK-FP16-NEXT: mov r2, #0 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 +; CHECK-FP16-NEXT: vmov.32 d11[1], r7 ; CHECK-FP16-NEXT: mov r0, #0 ; CHECK-FP16-NEXT: movwlo r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 +; CHECK-FP16-NEXT: vmov.32 d10[1], r1 ; CHECK-FP16-NEXT: mvnne r0, #0 -; CHECK-FP16-NEXT: subs r1, r5, r3 +; CHECK-FP16-NEXT: subs r1, r6, r3 ; CHECK-FP16-NEXT: sbcs r1, r7, #0 +; CHECK-FP16-NEXT: vmov.32 d13[1], r5 ; CHECK-FP16-NEXT: mov r1, #0 ; CHECK-FP16-NEXT: movwlo r1, #1 ; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: mvnne r1, #0 ; CHECK-FP16-NEXT: subs r7, r4, r3 -; CHECK-FP16-NEXT: sbcs r7, r8, #0 +; CHECK-FP16-NEXT: sbcs r7, r5, #0 ; CHECK-FP16-NEXT: vdup.32 d19, r1 ; CHECK-FP16-NEXT: mov r7, #0 ; CHECK-FP16-NEXT: vdup.32 d18, r0 ; CHECK-FP16-NEXT: movwlo r7, #1 ; CHECK-FP16-NEXT: cmp r7, #0 ; CHECK-FP16-NEXT: mvnne r7, #0 -; CHECK-FP16-NEXT: subs r3, r6, r3 +; CHECK-FP16-NEXT: subs r3, r8, r3 ; CHECK-FP16-NEXT: sbcs r3, r9, #0 -; CHECK-FP16-NEXT: vdup.32 d17, r7 +; CHECK-FP16-NEXT: 
vmov.32 d12[1], r9 ; CHECK-FP16-NEXT: movwlo r2, #1 ; CHECK-FP16-NEXT: cmp r2, #0 +; CHECK-FP16-NEXT: vdup.32 d17, r7 ; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: vand q10, q4, q9 +; CHECK-FP16-NEXT: vand q10, q5, q9 ; CHECK-FP16-NEXT: vdup.32 d16, r2 -; CHECK-FP16-NEXT: vand q11, q5, q8 +; CHECK-FP16-NEXT: vand q11, q6, q8 ; CHECK-FP16-NEXT: vorn q9, q10, q9 ; CHECK-FP16-NEXT: vorn q8, q11, q8 ; CHECK-FP16-NEXT: vmovn.i64 d1, q9 ; CHECK-FP16-NEXT: vmovn.i64 d0, q8 -; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11} +; CHECK-FP16-NEXT: vpop {d8} +; CHECK-FP16-NEXT: vpop {d10, d11, d12, d13} ; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} entry: %conv = fptoui <4 x half> %x to <4 x i64> @@ -902,8 +925,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEON-NEXT: vmov r0, s3 -; CHECK-NEON-NEXT: vmov.f32 s16, s2 +; CHECK-NEON-NEXT: vmov r0, s2 +; CHECK-NEON-NEXT: vmov.f32 s16, s3 ; CHECK-NEON-NEXT: vmov.f32 s18, s1 ; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f @@ -912,42 +935,43 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: mov r6, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: vmov r8, s20 -; CHECK-NEON-NEXT: vmov.32 d13[0], r5 ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: vmov r2, s18 -; CHECK-NEON-NEXT: vmov.32 d12[0], r0 -; CHECK-NEON-NEXT: mvn r9, #0 -; CHECK-NEON-NEXT: subs r0, r0, r9 -; CHECK-NEON-NEXT: sbcs r0, r1, #0 -; CHECK-NEON-NEXT: vmov.32 d13[1], r6 -; CHECK-NEON-NEXT: mov r0, #0 +; CHECK-NEON-NEXT: mov r2, r0 +; CHECK-NEON-NEXT: vmov r0, s18 +; CHECK-NEON-NEXT: vmov.32 d17[0], r2 +; CHECK-NEON-NEXT: mvn r8, #0 +; CHECK-NEON-NEXT: subs r2, r2, r8 +; CHECK-NEON-NEXT: vmov r4, s20 +; CHECK-NEON-NEXT: vmov.32 d16[0], r5 +; CHECK-NEON-NEXT: vmov.i64 q5, #0xffffffff ; 
CHECK-NEON-NEXT: mov r7, #0 -; CHECK-NEON-NEXT: movwlt r0, #1 -; CHECK-NEON-NEXT: cmp r0, #0 -; CHECK-NEON-NEXT: vmov.32 d12[1], r1 -; CHECK-NEON-NEXT: mvnne r0, #0 -; CHECK-NEON-NEXT: subs r1, r5, r9 -; CHECK-NEON-NEXT: sbcs r1, r6, #0 +; CHECK-NEON-NEXT: vmov.32 d17[1], r1 +; CHECK-NEON-NEXT: sbcs r1, r1, #0 ; CHECK-NEON-NEXT: mov r1, #0 ; CHECK-NEON-NEXT: movwlt r1, #1 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: vdup.32 d9, r1 -; CHECK-NEON-NEXT: vdup.32 d8, r0 -; CHECK-NEON-NEXT: mov r0, r2 +; CHECK-NEON-NEXT: subs r2, r5, r8 +; CHECK-NEON-NEXT: sbcs r2, r6, #0 +; CHECK-NEON-NEXT: vdup.32 d19, r1 +; CHECK-NEON-NEXT: mov r2, #0 +; CHECK-NEON-NEXT: vmov.32 d16[1], r6 +; CHECK-NEON-NEXT: movwlt r2, #1 +; CHECK-NEON-NEXT: cmp r2, #0 +; CHECK-NEON-NEXT: mvnne r2, #0 +; CHECK-NEON-NEXT: vdup.32 d18, r2 +; CHECK-NEON-NEXT: vorr q4, q9, q9 +; CHECK-NEON-NEXT: vbsl q4, q8, q5 +; CHECK-NEON-NEXT: vmov r10, r9, d8 ; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEON-NEXT: vbsl q4, q6, q5 ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r5, r0 ; CHECK-NEON-NEXT: vmov.32 d13[0], r0 -; CHECK-NEON-NEXT: mov r0, r8 +; CHECK-NEON-NEXT: mov r0, r4 ; CHECK-NEON-NEXT: mov r6, r1 -; CHECK-NEON-NEXT: vmov r4, r10, d8 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: subs r2, r5, r9 +; CHECK-NEON-NEXT: subs r2, r5, r8 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 ; CHECK-NEON-NEXT: sbcs r2, r6, #0 ; CHECK-NEON-NEXT: mov r2, #0 @@ -955,25 +979,25 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: movwlt r2, #1 ; CHECK-NEON-NEXT: cmp r2, #0 ; CHECK-NEON-NEXT: mvnne r2, #0 -; CHECK-NEON-NEXT: subs r0, r0, r9 +; CHECK-NEON-NEXT: subs r0, r0, r8 ; CHECK-NEON-NEXT: sbcs r0, r1, #0 -; CHECK-NEON-NEXT: vdup.32 d17, r2 -; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: vmov.32 d12[1], r1 +; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: movwlt r0, #1 ; 
CHECK-NEON-NEXT: cmp r0, #0 ; CHECK-NEON-NEXT: mvnne r0, #0 -; CHECK-NEON-NEXT: vmov r2, r3, d9 +; CHECK-NEON-NEXT: vdup.32 d17, r2 ; CHECK-NEON-NEXT: vdup.32 d16, r0 -; CHECK-NEON-NEXT: rsbs r6, r4, #0 +; CHECK-NEON-NEXT: vmov r0, r1, d9 ; CHECK-NEON-NEXT: vbsl q8, q6, q5 -; CHECK-NEON-NEXT: rscs r6, r10, #0 +; CHECK-NEON-NEXT: rsbs r6, r10, #0 +; CHECK-NEON-NEXT: rscs r6, r9, #0 ; CHECK-NEON-NEXT: mov r6, #0 +; CHECK-NEON-NEXT: vmov r2, r3, d17 ; CHECK-NEON-NEXT: movwlt r6, #1 +; CHECK-NEON-NEXT: vmov r5, r4, d16 ; CHECK-NEON-NEXT: cmp r6, #0 -; CHECK-NEON-NEXT: vmov r0, r1, d17 ; CHECK-NEON-NEXT: mvnne r6, #0 -; CHECK-NEON-NEXT: vmov r5, r4, d16 ; CHECK-NEON-NEXT: rsbs r0, r0, #0 ; CHECK-NEON-NEXT: rscs r0, r1, #0 ; CHECK-NEON-NEXT: mov r0, #0 @@ -982,20 +1006,20 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: mvnne r0, #0 ; CHECK-NEON-NEXT: rsbs r1, r2, #0 ; CHECK-NEON-NEXT: rscs r1, r3, #0 -; CHECK-NEON-NEXT: vmov.32 d19[0], r0 +; CHECK-NEON-NEXT: vdup.32 d21, r0 ; CHECK-NEON-NEXT: mov r1, #0 +; CHECK-NEON-NEXT: vdup.32 d20, r6 ; CHECK-NEON-NEXT: movwlt r1, #1 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: rsbs r0, r5, #0 -; CHECK-NEON-NEXT: rscs r0, r4, #0 -; CHECK-NEON-NEXT: vmov.32 d21[0], r1 +; CHECK-NEON-NEXT: rsbs r2, r5, #0 +; CHECK-NEON-NEXT: rscs r2, r4, #0 +; CHECK-NEON-NEXT: vdup.32 d19, r1 ; CHECK-NEON-NEXT: movwlt r7, #1 ; CHECK-NEON-NEXT: cmp r7, #0 -; CHECK-NEON-NEXT: vmov.32 d20[0], r6 ; CHECK-NEON-NEXT: mvnne r7, #0 -; CHECK-NEON-NEXT: vmov.32 d18[0], r7 ; CHECK-NEON-NEXT: vand q10, q4, q10 +; CHECK-NEON-NEXT: vdup.32 d18, r7 ; CHECK-NEON-NEXT: vand q8, q8, q9 ; CHECK-NEON-NEXT: vmovn.i64 d1, q10 ; CHECK-NEON-NEXT: vmovn.i64 d0, q8 @@ -1004,78 +1028,78 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; ; CHECK-FP16-LABEL: ustest_f16i32: ; CHECK-FP16: @ %bb.0: @ %entry -; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r10, 
lr} +; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-FP16-NEXT: vmov.u16 r0, d0[3] ; CHECK-FP16-NEXT: vorr d8, d0, d0 -; CHECK-FP16-NEXT: vmov.u16 r8, d0[0] -; CHECK-FP16-NEXT: vmov.u16 r9, d0[1] +; CHECK-FP16-NEXT: vmov.u16 r8, d0[1] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] ; CHECK-FP16-NEXT: mov r5, r1 -; CHECK-FP16-NEXT: vmov.32 d9[0], r4 +; CHECK-FP16-NEXT: vmov.32 d11[0], r4 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: mvn r10, #0 -; CHECK-FP16-NEXT: subs r2, r4, r10 +; CHECK-FP16-NEXT: mvn r7, #0 +; CHECK-FP16-NEXT: subs r2, r4, r7 ; CHECK-FP16-NEXT: sbcs r2, r5, #0 -; CHECK-FP16-NEXT: vmov.32 d8[0], r0 +; CHECK-FP16-NEXT: vmov.32 d10[0], r0 ; CHECK-FP16-NEXT: mov r2, #0 -; CHECK-FP16-NEXT: vmov s0, r9 +; CHECK-FP16-NEXT: vmov.i64 q6, #0xffffffff ; CHECK-FP16-NEXT: movwlt r2, #1 ; CHECK-FP16-NEXT: cmp r2, #0 ; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: subs r0, r0, r10 +; CHECK-FP16-NEXT: subs r0, r0, r7 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 -; CHECK-FP16-NEXT: vmov.32 d9[1], r5 +; CHECK-FP16-NEXT: vmov.32 d11[1], r5 ; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-FP16-NEXT: vmov s0, r8 ; CHECK-FP16-NEXT: movwlt r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 -; CHECK-FP16-NEXT: vmov.32 d8[1], r1 +; CHECK-FP16-NEXT: vmov.32 d10[1], r1 ; CHECK-FP16-NEXT: mvnne r0, #0 ; CHECK-FP16-NEXT: mov r6, #0 ; CHECK-FP16-NEXT: vdup.32 d17, r2 ; CHECK-FP16-NEXT: vdup.32 d16, r0 -; CHECK-FP16-NEXT: vbif q4, q5, q8 +; CHECK-FP16-NEXT: vbif q5, q6, q8 +; CHECK-FP16-NEXT: vmov r9, r8, d10 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: vmov s0, r8 ; CHECK-FP16-NEXT: mov r4, r0 +; CHECK-FP16-NEXT: vmov.u16 r0, d8[0] ; CHECK-FP16-NEXT: 
mov r5, r1 -; CHECK-FP16-NEXT: vmov.32 d13[0], r0 -; CHECK-FP16-NEXT: vmov r7, r8, d8 +; CHECK-FP16-NEXT: vmov.32 d9[0], r4 +; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: subs r2, r4, r10 -; CHECK-FP16-NEXT: vmov.32 d12[0], r0 +; CHECK-FP16-NEXT: subs r2, r4, r7 +; CHECK-FP16-NEXT: vmov.32 d8[0], r0 ; CHECK-FP16-NEXT: sbcs r2, r5, #0 ; CHECK-FP16-NEXT: mov r2, #0 -; CHECK-FP16-NEXT: vmov.32 d13[1], r5 +; CHECK-FP16-NEXT: vmov.32 d9[1], r5 ; CHECK-FP16-NEXT: movwlt r2, #1 ; CHECK-FP16-NEXT: cmp r2, #0 ; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: subs r0, r0, r10 +; CHECK-FP16-NEXT: subs r0, r0, r7 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 -; CHECK-FP16-NEXT: vdup.32 d17, r2 +; CHECK-FP16-NEXT: vmov.32 d8[1], r1 ; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: vmov.32 d12[1], r1 ; CHECK-FP16-NEXT: movwlt r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mvnne r0, #0 -; CHECK-FP16-NEXT: vmov r2, r3, d9 +; CHECK-FP16-NEXT: vdup.32 d17, r2 ; CHECK-FP16-NEXT: vdup.32 d16, r0 -; CHECK-FP16-NEXT: rsbs r7, r7, #0 -; CHECK-FP16-NEXT: vbsl q8, q6, q5 +; CHECK-FP16-NEXT: vmov r0, r1, d11 +; CHECK-FP16-NEXT: vbsl q8, q4, q6 +; CHECK-FP16-NEXT: rsbs r7, r9, #0 ; CHECK-FP16-NEXT: rscs r7, r8, #0 ; CHECK-FP16-NEXT: mov r7, #0 +; CHECK-FP16-NEXT: vmov r2, r3, d17 ; CHECK-FP16-NEXT: movwlt r7, #1 +; CHECK-FP16-NEXT: vmov r5, r4, d16 ; CHECK-FP16-NEXT: cmp r7, #0 -; CHECK-FP16-NEXT: vmov r0, r1, d17 ; CHECK-FP16-NEXT: mvnne r7, #0 -; CHECK-FP16-NEXT: vmov r5, r4, d16 ; CHECK-FP16-NEXT: rsbs r0, r0, #0 ; CHECK-FP16-NEXT: rscs r0, r1, #0 ; CHECK-FP16-NEXT: mov r0, #0 @@ -1084,25 +1108,25 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: mvnne r0, #0 ; CHECK-FP16-NEXT: rsbs r1, r2, #0 ; CHECK-FP16-NEXT: rscs r1, r3, #0 -; CHECK-FP16-NEXT: vmov.32 d19[0], r0 +; CHECK-FP16-NEXT: vdup.32 d21, r0 ; CHECK-FP16-NEXT: mov r1, #0 +; CHECK-FP16-NEXT: vdup.32 d20, r7 ; CHECK-FP16-NEXT: movwlt r1, #1 ; CHECK-FP16-NEXT: cmp r1, #0 ; 
CHECK-FP16-NEXT: mvnne r1, #0 -; CHECK-FP16-NEXT: rsbs r0, r5, #0 -; CHECK-FP16-NEXT: rscs r0, r4, #0 -; CHECK-FP16-NEXT: vmov.32 d21[0], r1 +; CHECK-FP16-NEXT: rsbs r2, r5, #0 +; CHECK-FP16-NEXT: rscs r2, r4, #0 +; CHECK-FP16-NEXT: vdup.32 d19, r1 ; CHECK-FP16-NEXT: movwlt r6, #1 ; CHECK-FP16-NEXT: cmp r6, #0 -; CHECK-FP16-NEXT: vmov.32 d20[0], r7 ; CHECK-FP16-NEXT: mvnne r6, #0 -; CHECK-FP16-NEXT: vmov.32 d18[0], r6 -; CHECK-FP16-NEXT: vand q10, q4, q10 +; CHECK-FP16-NEXT: vand q10, q5, q10 +; CHECK-FP16-NEXT: vdup.32 d18, r6 ; CHECK-FP16-NEXT: vand q8, q8, q9 ; CHECK-FP16-NEXT: vmovn.i64 d1, q10 ; CHECK-FP16-NEXT: vmovn.i64 d0, q8 ; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} entry: %conv = fptosi <4 x half> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll index 742f2a75a1aa8..0134ee48ad421 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll @@ -279,40 +279,42 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.u16 r0, q4[3] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: rsbs r2, r4, #0 ; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r0, r6, r1 +; CHECK-NEXT: bfi r3, r2, #0, 
#8 ; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r1, r0, #0, #8 -; CHECK-NEXT: rsbs r0, r4, #0 -; CHECK-NEXT: sbcs.w r0, r6, r5 +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmov.u16 r0, q4[0] ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: bfi r1, r0, #8, #8 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.u16 r0, q4[1] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r0, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r6, r1 -; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: rsbs r1, r4, #0 -; CHECK-NEXT: sbcs.w r1, r6, r5 -; CHECK-NEXT: bfi r6, r0, #0, #8 +; CHECK-NEXT: bfi r6, r2, #0, #8 ; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r6, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r6 @@ -1351,40 +1353,42 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.u16 r0, q4[3] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: rsbs r2, r4, #0 ; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: mov.w r3, #0 ; 
CHECK-NEXT: sbcs.w r0, r6, r1 +; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bfi r1, r0, #0, #8 -; CHECK-NEXT: rsbs r0, r4, #0 -; CHECK-NEXT: sbcs.w r0, r6, r5 +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmov.u16 r0, q4[0] ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: bfi r1, r0, #8, #8 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.u16 r0, q4[1] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r0, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: sbcs.w r2, r6, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r6, r1 -; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: rsbs r1, r4, #0 -; CHECK-NEXT: sbcs.w r1, r6, r5 -; CHECK-NEXT: bfi r6, r0, #0, #8 +; CHECK-NEXT: bfi r6, r2, #0, #8 ; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r6, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r6 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll index b4a2aa7a1ed1b..5ac88c581f33c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -33,12 +33,7 @@ define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(ptr %base, ptr %offptr) { ; CHECK-LABEL: unscaled_v2i8_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrb r2, [r1] -; CHECK-NEXT: vmov.i32 q0, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: ldrb r1, [r0, r1] ; CHECK-NEXT: ldrb r0, [r0, r2] ; CHECK-NEXT: 
vmov q0[2], q0[0], r0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll index acbe48f9e5927..fe5d7f29f01ff 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -302,35 +302,37 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov r10, s8 ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: asr.w r0, r10, #31 +; CHECK-NEXT: vmov r10, s2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: adds.w r6, r10, r2 -; CHECK-NEXT: eor.w r7, r10, r2 +; CHECK-NEXT: asr.w r0, r10, #31 ; CHECK-NEXT: adc r3, r0, #0 +; CHECK-NEXT: eor.w r1, r10, r2 ; CHECK-NEXT: asrl r6, r3, r2 ; CHECK-NEXT: subs r0, r6, r2 -; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov r6, s12 ; CHECK-NEXT: sbc lr, r3, #0 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: umull r0, r8, r0, r2 -; CHECK-NEXT: asrs r5, r6, #31 +; CHECK-NEXT: vmov.i64 q3, #0xffffffff +; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: adds r4, r6, r3 -; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: eor.w r1, r6, r3 +; CHECK-NEXT: asr.w r7, r6, #31 +; CHECK-NEXT: adc r5, r7, #0 +; CHECK-NEXT: eor.w r7, r6, r3 ; CHECK-NEXT: asrl r4, r5, r3 ; CHECK-NEXT: subs r4, r4, r3 ; CHECK-NEXT: sbc r5, r5, #0 -; CHECK-NEXT: orrs.w r7, r7, r10, asr #31 +; CHECK-NEXT: orrs.w r7, r7, r6, asr #31 ; CHECK-NEXT: umull r4, r12, r4, r3 ; CHECK-NEXT: csetm r9, eq -; CHECK-NEXT: orrs.w r1, r1, r6, asr #31 +; CHECK-NEXT: orrs.w r1, r1, r10, asr #31 ; CHECK-NEXT: mov.w r7, #0 ; CHECK-NEXT: csetm r1, eq ; CHECK-NEXT: bfi 
r7, r9, #0, #8 @@ -343,47 +345,49 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK-NEXT: rsb.w r1, r10, #0 ; CHECK-NEXT: lsll r4, r5, r3 ; CHECK-NEXT: lsll r0, r7, r1 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: lsll r0, r7, r2 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 ; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov q3[3], q3[1], r5, r7 ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: adds r2, r3, r1 -; CHECK-NEXT: asr.w r0, r3, #31 -; CHECK-NEXT: adc r5, r0, #0 -; CHECK-NEXT: asrl r2, r5, r1 -; CHECK-NEXT: subs r0, r2, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sbc r8, r5, #0 -; CHECK-NEXT: umull r4, lr, r0, r1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: asrs r5, r2, #31 -; CHECK-NEXT: adds r6, r2, r0 +; CHECK-NEXT: adds r4, r0, r1 +; CHECK-NEXT: asr.w r2, r0, #31 +; CHECK-NEXT: adc r3, r2, #0 +; CHECK-NEXT: asrl r4, r3, r1 +; CHECK-NEXT: subs r2, r4, r1 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: sbc r8, r3, #0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: umull r2, lr, r2, r1 +; CHECK-NEXT: adds r6, r4, r3 +; CHECK-NEXT: asr.w r5, r4, #31 ; CHECK-NEXT: adc r7, r5, #0 ; CHECK-NEXT: mla r5, r8, r1, lr -; CHECK-NEXT: asrl r6, r7, r0 -; CHECK-NEXT: subs.w r8, r6, r0 -; CHECK-NEXT: eor.w r6, r2, r0 +; CHECK-NEXT: asrl r6, r7, r3 +; CHECK-NEXT: subs.w r8, r6, r3 +; CHECK-NEXT: eor.w r6, r4, r3 ; CHECK-NEXT: sbc lr, r7, #0 -; CHECK-NEXT: eor.w r7, r3, r1 -; CHECK-NEXT: orrs.w r6, r6, r2, asr #31 -; CHECK-NEXT: orr.w r7, r7, r3, asr #31 +; CHECK-NEXT: eor.w r7, r0, r1 +; CHECK-NEXT: orrs.w r6, r6, r4, asr #31 +; CHECK-NEXT: orr.w r7, r7, r0, asr #31 ; CHECK-NEXT: csetm r6, eq ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: bfi r12, r6, #0, #8 ; CHECK-NEXT: csetm r6, eq ; CHECK-NEXT: bfi r12, r6, #8, #8 -; CHECK-NEXT: umull r6, r7, r8, r0 -; CHECK-NEXT: rsb.w r8, r3, #0 -; CHECK-NEXT: lsll r4, r5, r8 +; CHECK-NEXT: umull r6, r7, r8, r3 +; CHECK-NEXT: rsbs r0, 
r0, #0 +; CHECK-NEXT: lsll r2, r5, r0 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: mla r7, lr, r3, r7 +; CHECK-NEXT: lsll r2, r5, r1 ; CHECK-NEXT: vmsr p0, r12 -; CHECK-NEXT: mla r3, lr, r0, r7 -; CHECK-NEXT: lsll r4, r5, r1 -; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: lsll r6, r3, r1 -; CHECK-NEXT: lsll r6, r3, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 +; CHECK-NEXT: lsll r6, r7, r0 +; CHECK-NEXT: lsll r6, r7, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s8 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll index 0bec2b100911c..b8d9670710a00 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -420,6 +420,7 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2f64(<2 x double> %src) { ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: csetm r0, eq ; CHECK-MVE-NEXT: vmov q0[2], q0[0], r0, r6 +; CHECK-MVE-NEXT: vmov q0[3], q0[1], r0, r6 ; CHECK-MVE-NEXT: vand q0, q0, q4 ; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 29b56639bd769..5972a9a7cf934 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -11,59 +11,63 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB0_8 ; CHECK-NEXT: @ %bb.1: @ %entry -; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: bne .LBB0_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: mov r10, r11 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r10, r5 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %vector.ph ; CHECK-NEXT: bic r2, r3, #1 ; CHECK-NEXT: 
adr r4, .LCPI0_0 ; CHECK-NEXT: subs r7, r2, #2 ; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: adr r4, .LCPI0_1 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r10, r11, r2, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r8, r1, r2, lsl #2 +; CHECK-NEXT: add.w r10, r5, r2, lsl #2 +; CHECK-NEXT: add.w r11, r1, r2, lsl #2 ; CHECK-NEXT: add.w r12, r0, r2, lsl #2 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vmvn.i32 q1, #0x80000000 +; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r2, [r0], #8 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: ldrd r7, r6, [r1], #8 +; CHECK-NEXT: mov.w r3, #-1 +; CHECK-NEXT: ldrd r7, r8, [r1], #8 ; CHECK-NEXT: smull r4, r7, r7, r4 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 -; CHECK-NEXT: mov.w r9, #-1 -; CHECK-NEXT: sbcs.w r3, r9, r7 +; CHECK-NEXT: sbcs r3, r7 +; CHECK-NEXT: mov r9, r5 ; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: smull r2, r3, r6, r2 +; CHECK-NEXT: smull r2, r3, r8, r2 ; CHECK-NEXT: asrl r2, r3, #31 ; CHECK-NEXT: rsbs.w r6, r2, #-2147483648 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: sbcs.w r6, r9, r3 +; CHECK-NEXT: mov.w r6, #-1 ; CHECK-NEXT: vmov q2[3], q2[1], r7, r3 +; CHECK-NEXT: sbcs r6, r3 ; CHECK-NEXT: csetm r6, lt ; CHECK-NEXT: bfi r5, r6, #8, #8 +; CHECK-NEXT: mvn r6, #-2147483648 ; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: mvn r5, #-2147483648 +; CHECK-NEXT: mov r5, r9 ; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: subs r2, r2, r5 +; CHECK-NEXT: subs r2, r2, r6 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, r4, d5 -; CHECK-NEXT: subs r2, r2, r5 +; CHECK-NEXT: subs r2, r2, r6 ; CHECK-NEXT: sbcs r2, r4, 
#0 ; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r3, r2, #8, #8 @@ -71,7 +75,8 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vpsel q2, q2, q1 ; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: strd r3, r2, [r11], #8 +; CHECK-NEXT: strd r3, r2, [r5] +; CHECK-NEXT: add.w r5, r9, #8 ; CHECK-NEXT: le lr, .LBB0_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldrd r2, r3, [sp] @ 8-byte Folded Reload @@ -85,7 +90,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r2, [r12], #4 -; CHECK-NEXT: ldr r4, [r8], #4 +; CHECK-NEXT: ldr r4, [r11], #4 ; CHECK-NEXT: smull r2, r5, r4, r2 ; CHECK-NEXT: asrl r2, r5, #31 ; CHECK-NEXT: subs r4, r1, r2 @@ -107,6 +112,11 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .long 4294967295 @ 0xffffffff ; CHECK-NEXT: .long 2147483648 @ 0x80000000 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 entry: switch i32 %N, label %vector.ph [ i32 0, label %for.cond.cleanup @@ -603,56 +613,57 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: @ %bb.1: @ %entry -; CHECK-NEXT: mov r8, r2 ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: bne .LBB3_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r11, r1 -; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r10, r1 +; 
CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #1 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: bic r3, r3, #1 +; CHECK-NEXT: subs r7, r3, #2 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: subs r7, r5, #2 -; CHECK-NEXT: str r5, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r2, r8, r5, lsl #2 -; CHECK-NEXT: add.w r11, r1, r5, lsl #2 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r11, r2, r3, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 -; CHECK-NEXT: add.w r12, r0, r5, lsl #2 -; CHECK-NEXT: vmov.i8 q0, #0xff +; CHECK-NEXT: add.w r10, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r9, [r0], #8 -; CHECK-NEXT: ldrd r5, r10, [r1], #8 -; CHECK-NEXT: umull r4, r5, r5, r4 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: csetm r5, lo -; CHECK-NEXT: bfi r6, r5, #0, #8 -; CHECK-NEXT: umull r10, r5, r10, r9 -; CHECK-NEXT: lsrl r10, r5, #31 -; CHECK-NEXT: subs.w r7, r10, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r10 -; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: ldrd r4, r6, [r0], #8 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: ldrd r7, r3, [r1], #8 +; CHECK-NEXT: umull r4, r9, r7, r4 +; CHECK-NEXT: lsrl r4, r9, #31 +; CHECK-NEXT: subs.w r5, r4, #-1 +; CHECK-NEXT: sbcs r5, r9, #0 ; CHECK-NEXT: csetm r5, lo -; CHECK-NEXT: bfi r6, r5, #8, #8 -; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: bfi r8, r5, #0, #8 +; CHECK-NEXT: umull r6, r5, r3, r6 +; CHECK-NEXT: lsrl r6, r5, #31 +; CHECK-NEXT: subs.w r7, r6, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r6 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r9, r5 +; CHECK-NEXT: csetm r3, lo +; CHECK-NEXT: bfi r8, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r8 ; CHECK-NEXT: vpsel q1, q1, q0 -; CHECK-NEXT: vmov 
r4, s6 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: strd r5, r4, [r8], #8 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: strd r4, r3, [r2], #8 ; CHECK-NEXT: le lr, .LBB3_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: .LBB3_6: @ %for.body.preheader @@ -660,17 +671,17 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .LBB3_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r1, [r11], #4 +; CHECK-NEXT: ldr r1, [r10], #4 ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: lsrl r0, r1, #31 -; CHECK-NEXT: subs.w r3, r0, #-1 +; CHECK-NEXT: subs.w r2, r0, #-1 ; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, #-1 -; CHECK-NEXT: str r0, [r2], #4 +; CHECK-NEXT: str r0, [r11], #4 ; CHECK-NEXT: le lr, .LBB3_7 ; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: switch i32 %N, label %vector.ph [ @@ -750,69 +761,78 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB4_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB4_3 ; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: @ %vector.ph -; CHECK-NEXT: bic r8, r3, #3 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: bic r3, r3, #3 
+; CHECK-NEXT: subs r7, r3, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: sub.w r7, r8, #4 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: add.w r11, r2, r8, lsl #2 -; CHECK-NEXT: add.w r9, r1, r8, lsl #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r10, r1, r3, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: add.w r12, r0, r8, lsl #2 +; CHECK-NEXT: add.w r7, r2, r3, lsl #2 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmullb.u32 q5, q4, q3 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r10, r5, d10 -; CHECK-NEXT: lsrl r10, r5, #31 +; CHECK-NEXT: vmov r4, r9, d10 +; CHECK-NEXT: lsrl r4, r9, #31 ; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: subs.w r6, r10, #-1 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: csetm r5, lo +; CHECK-NEXT: subs.w r5, r4, #-1 +; CHECK-NEXT: sbcs r5, r9, #0 ; CHECK-NEXT: vmullb.u32 q4, q2, q1 -; CHECK-NEXT: bfi r6, r5, #0, #8 -; CHECK-NEXT: vmov r4, r5, d11 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r7, r4, #-1 -; CHECK-NEXT: vmov q3[2], q3[0], r10, r4 -; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: csetm r5, lo -; CHECK-NEXT: bfi r6, r5, #8, #8 -; CHECK-NEXT: vmov r10, r5, d8 -; CHECK-NEXT: lsrl r10, r5, #31 +; CHECK-NEXT: bfi r6, r5, #0, #8 +; CHECK-NEXT: vmov r8, r5, d11 +; CHECK-NEXT: lsrl r8, r5, #31 +; CHECK-NEXT: subs.w r11, r8, #-1 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r8 +; CHECK-NEXT: sbcs r7, r5, #0 +; CHECK-NEXT: vmov q3[3], q3[1], r9, r5 +; CHECK-NEXT: csetm r7, lo +; CHECK-NEXT: bfi r6, r7, #8, #8 +; 
CHECK-NEXT: vmov r4, r7, d8 +; CHECK-NEXT: lsrl r4, r7, #31 ; CHECK-NEXT: vmsr p0, r6 -; CHECK-NEXT: subs.w r6, r10, #-1 -; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: subs.w r5, r4, #-1 ; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: sbcs r5, r7, #0 +; CHECK-NEXT: vpsel q3, q3, q0 ; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r6, r5, #0, #8 -; CHECK-NEXT: vmov r4, r5, d9 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r7, r4, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], r10, r4 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: csetm r5, lo -; CHECK-NEXT: bfi r6, r5, #8, #8 +; CHECK-NEXT: vmov r2, r5, d9 +; CHECK-NEXT: lsrl r2, r5, #31 +; CHECK-NEXT: subs.w r3, r2, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r7, r5 +; CHECK-NEXT: csetm r3, lo +; CHECK-NEXT: bfi r6, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vpsel q1, q1, q0 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s6, s12 @@ -820,23 +840,26 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB4_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r8, r3 +; CHECK-NEXT: ldrd r7, r3, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB4_8 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r8 +; CHECK-NEXT: sub.w lr, r3, r7 ; CHECK-NEXT: .LBB4_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r1, [r9], #4 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: lsrl r0, r1, #31 +; CHECK-NEXT: ldr r2, [r10], #4 +; CHECK-NEXT: umull r0, r3, r2, r0 +; CHECK-NEXT: lsrl r0, r3, #31 ; CHECK-NEXT: subs.w r2, r0, #-1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, 
#-1 -; CHECK-NEXT: str r0, [r11], #4 +; CHECK-NEXT: str r0, [r1], #4 ; CHECK-NEXT: le lr, .LBB4_7 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll index 87df13787c6c8..d49973a674a21 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -35,15 +35,10 @@ entry: define arm_aapcs_vfpcc void @unscaled_v2i8_i8(ptr %base, ptr %offptr, <2 x i8> %input) { ; CHECK-LABEL: unscaled_v2i8_i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: ldrb r2, [r1] -; CHECK-NEXT: vmov.i32 q1, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strb r2, [r0, r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: strb r3, [r0, r2] ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: strb r2, [r0, r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll index f9948db66b3b3..d2d3912fec65c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -443,7 +443,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0xffff +; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s4 @@ -1363,7 +1363,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %b) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: vmov.i64 q2, #0xff ; 
CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s4 @@ -1870,7 +1870,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q2, #0xffff +; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s4 @@ -2544,7 +2544,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q2, #0xff +; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index 63b1431ac0fa4..76a15f4459afe 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -393,7 +393,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q3, #0xffff +; CHECK-NEXT: vmov.i64 q3, #0xffff ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmov r2, s4 @@ -1587,7 +1587,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: vmov.i64 q3, #0xff ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmov r2, s4 @@ -2020,7 +2020,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q3, #0xffff +; CHECK-NEXT: vmov.i64 q3, #0xffff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand 
q2, q2, q3 ; CHECK-NEXT: vmov r2, s8 @@ -2915,7 +2915,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: vmov.i64 q3, #0xff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmov r2, s8 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index a8574c0b7516c..5aac1554e6e3b 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6625,7 +6625,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 @@ -6833,7 +6833,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index c1ef500d9d3de..c83514dbe7de2 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -5638,7 +5638,10 @@ define <8 x i64> 
@test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mask_mul_epu32_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] +; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmb: @@ -5657,9 +5660,12 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> ; X86-LABEL: test_mask_mul_epu32_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x08] +; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10] +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A] +; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -5681,9 +5687,12 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) ; X86-LABEL: 
test_mask_mul_epu32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] +; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmbkz: @@ -7377,7 +7386,10 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mul_epu32_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] +; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mul_epu32_rmb: @@ -7396,9 +7408,12 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass ; X86-LABEL: test_mul_epu32_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl 
{{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x08] +; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10] +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A] +; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -7422,9 +7437,12 @@ define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) { ; X86-LABEL: test_mul_epu32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] +; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mul_epu32_rmbkz: diff --git 
a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index e8e22bae23c92..4ef485b916fe4 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -11548,7 +11548,11 @@ define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mask_mul_epu32_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; X86-NEXT: vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a] +; X86-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmb_128: @@ -11567,9 +11571,13 @@ define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x ; X86-LABEL: test_mask_mul_epu32_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x08] +; X86-NEXT: vpbroadcastd (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x10] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendd $10, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x02,0xd3,0x0a] +; X86-NEXT: # xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 
encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xca] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -11591,9 +11599,13 @@ define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, ptr %ptr_b, i8 % ; X86-LABEL: test_mask_mul_epu32_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; X86-NEXT: vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a] +; X86-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmbkz_128: @@ -11716,7 +11728,11 @@ define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mask_mul_epu32_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08] +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0xef,0xd2] +; X86-NEXT: vpblendd $170, %ymm2, %ymm1, %ymm1 # encoding: [0xc4,0xe3,0x75,0x02,0xca,0xaa] +; X86-NEXT: # ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmb_256: @@ -11735,9 +11751,13 @@ define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x ; X86-LABEL: test_mask_mul_epu32_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x08] +; X86-NEXT: vpbroadcastd (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x10] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendd $170, %ymm3, %ymm2, %ymm2 # encoding: [0xc4,0xe3,0x6d,0x02,0xd3,0xaa] +; X86-NEXT: # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xca] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -11759,9 +11779,13 @@ define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, ptr %ptr_b, i8 % ; X86-LABEL: test_mask_mul_epu32_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; 
X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x00] +; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08] +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; X86-NEXT: vpblendd $170, %ymm2, %ymm1, %ymm1 # encoding: [0xc4,0xe3,0x75,0x02,0xca,0xaa] +; X86-NEXT: # ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmbkz_256: diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index bed8d5fcb1869..384a4c8f889ad 100644 --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -2698,8 +2698,9 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2723,8 +2724,9 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) lo ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2751,8 +2753,9 @@ define zeroext i4 
@test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2781,8 +2784,9 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2810,8 +2814,9 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2838,8 +2843,9 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7496,8 +7502,9 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; 
NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7521,8 +7528,9 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7549,8 +7557,9 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7579,8 +7588,9 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7608,8 +7618,9 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7636,8 +7647,9 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, < ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: 
kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9639,7 +9651,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9665,7 +9677,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9692,7 +9704,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9722,7 +9734,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9752,7 +9764,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-LABEL: 
test_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9778,7 +9790,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9805,7 +9817,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9835,7 +9847,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9866,7 +9878,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, 
%ecx @@ -9899,7 +9911,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -9933,7 +9945,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9973,7 +9985,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10013,7 +10025,7 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10041,7 +10053,7 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} 
zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10070,7 +10082,7 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -10102,7 +10114,7 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -10133,7 +10145,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10159,7 +10171,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10186,7 +10198,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: 
# %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -10216,7 +10228,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -10246,7 +10258,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10272,7 +10284,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10299,7 +10311,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -10329,7 
+10341,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -10360,7 +10372,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10387,7 +10399,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10415,7 +10427,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10446,7 +10458,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = 
~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10477,7 +10489,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10504,7 +10516,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10532,7 +10544,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10563,7 +10575,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10594,14 +10606,14 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; 
NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 -; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10631,13 +10643,13 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, ptr %__b ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm1 -; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10667,7 +10679,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 -; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10676,7 +10688,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; 
NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -10710,7 +10722,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm1 -; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -10719,7 +10731,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm1 ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -12354,8 +12366,9 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12379,8 +12392,9 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12407,8 +12421,9 @@ define zeroext i4 
@test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12437,8 +12452,9 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12466,8 +12482,9 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12494,8 +12511,9 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, < ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14498,7 +14516,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14524,7 +14542,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub (%rdi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14552,7 +14570,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14582,7 +14600,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub (%rsi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14613,7 +14631,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14639,7 +14657,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub (%rdi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: 
vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14667,7 +14685,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14697,7 +14715,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub (%rsi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14729,7 +14747,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -14762,7 +14780,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub (%rdi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -14797,7 +14815,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 ; 
NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14837,7 +14855,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxub (%rsi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14878,7 +14896,7 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14906,7 +14924,7 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -14936,7 +14954,7 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq 
%zmm0, %zmm0, %k0 {%k1} @@ -14968,7 +14986,7 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -15000,7 +15018,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15026,7 +15044,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15054,7 +15072,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -15084,7 +15102,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, 
%zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -15115,7 +15133,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15141,7 +15159,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15169,7 +15187,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -15199,7 +15217,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1 ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} @@ -15231,7 +15249,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX: # %bb.0: 
# %entry ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15258,7 +15276,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15287,7 +15305,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15318,7 +15336,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15350,7 +15368,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax 
@@ -15377,7 +15395,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15406,7 +15424,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15437,7 +15455,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15469,7 +15487,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 ; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 -; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -15477,7 +15495,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: 
vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15506,14 +15524,14 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, ptr %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1 -; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15544,7 +15562,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 ; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 -; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15554,7 +15572,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -15588,7 +15606,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1 -; 
NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -15597,7 +15615,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpmaxuw 32(%rsi), %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx @@ -17232,8 +17250,9 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17257,8 +17276,9 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17285,8 +17305,9 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17315,8 +17336,9 @@ define zeroext i4 
@test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17344,8 +17366,9 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17372,8 +17395,9 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, < ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21057,8 +21081,9 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21082,8 +21107,9 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovapd (%rdi), %xmm1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax 
-; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21107,8 +21133,9 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b) ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21136,8 +21163,9 @@ define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i6 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21165,8 +21193,9 @@ define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21194,8 +21223,9 @@ define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, < ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll index 921cf88518562..de030f1b78d3d 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ 
b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -195,7 +195,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 ; SSSE3-NEXT: pcmpgtb %xmm3, %xmm2 ; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,1] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; SSSE3-NEXT: movmskpd %xmm2, %eax ; SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSSE3-NEXT: retq @@ -342,21 +342,25 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 +; 
SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll index f21c3f7043e69..d050a7f66104c 100644 --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -157,7 +157,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; SSSE3-LABEL: v2i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,1] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; SSSE3-NEXT: movmskpd %xmm0, %eax ; SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSSE3-NEXT: retq @@ -272,10 +272,12 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll index 58bf0b607eb52..97184e68d2953 100644 --- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll @@ -615,13 +615,15 @@ define void @bitcast_8i64_store(ptr %p, <8 x i64> %a0) { ; ; AVX1-LABEL: bitcast_8i64_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, 
%xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movb %al, (%rdi) @@ -630,6 +632,9 @@ define void @bitcast_8i64_store(ptr %p, <8 x i64> %a0) { ; ; AVX2-LABEL: bitcast_8i64_store: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c2d648..74ff87911d81d 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -634,7 +634,11 @@ define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind { define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { ; SSE-LABEL: bitcast_v8i64_to_v2i4: ; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 @@ -648,13 +652,15 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: bitcast_v8i64_to_v2i4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, 
%xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movl %eax, %ecx @@ -667,6 +673,9 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { ; ; AVX2-LABEL: bitcast_v8i64_to_v2i4: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax diff --git a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll index 345014edd0e9d..4c5e9225e1447 100644 --- a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll +++ b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll @@ -266,16 +266,18 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: paddq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pmuludq 
%xmm0, %xmm3 +; SSE2-NEXT: psllq $32, %xmm3 +; SSE2-NEXT: paddq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE2-NEXT: paddq %xmm3, %xmm0 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; @@ -319,27 +321,26 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind { ; SSE2-LABEL: dot_ext_v4i16_v4i32: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: leaq (%rsi,%rsi,2), %rcx ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0 ; SSE2-NEXT: pinsrw $4, (%rdi,%rsi,2), %xmm0 -; SSE2-NEXT: leaq (%rsi,%rsi,2), %rax -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pinsrw $6, (%rdi,%rax), %xmm1 -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pinsrw $6, (%rdi,%rcx), %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; 
SSE4-LABEL: dot_ext_v4i16_v4i32: @@ -402,16 +403,16 @@ define i32 @dot_ext_v2i16_v2i32(ptr %a, i64 %a_stride, ptr %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; SSE4-LABEL: dot_ext_v2i16_v2i32: @@ -461,15 +462,17 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: psllq $32, %xmm3 +; SSE2-NEXT: paddq %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE2-NEXT: paddq %xmm3, %xmm0 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 
70335f834291d..cfac763acd04b 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -44,13 +44,43 @@ define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: combine_shuffle_zero_pmuludq: ; SSE: # %bb.0: -; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE-NEXT: pmuludq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_shuffle_zero_pmuludq: -; AVX: # %bb.0: -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_shuffle_zero_pmuludq: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_shuffle_zero_pmuludq: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: combine_shuffle_zero_pmuludq: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> %3 = bitcast <4 x i32> %1 to <2 x i64> @@ -62,12 +92,20 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) { ; SSE-LABEL: combine_shuffle_zero_pmuludq_256: ; SSE: # %bb.0: -; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] ; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE-NEXT: pmuludq %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_shuffle_zero_pmuludq_256: ; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -77,16 +115,25 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) ; ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256: ; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: retq %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> @@ -100,29 +147,33 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) { ; SSE-LABEL: combine_zext_pmuludq_256: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: pmovsxdq {{.*#+}} xmm4 = [715827883,715827883] -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: pmuludq %xmm4, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pmovsxdq {{.*#+}} xmm3 = [715827883,715827883] +; SSE-NEXT: pmuludq %xmm3, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_zext_pmuludq_256: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [715827883,715827883] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 1ae1d61091362..045979afc1f53 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1538,7 +1538,7 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: psrlq $62, %xmm1 ; SSE2-NEXT: paddq %xmm0, %xmm1 @@ -1552,7 +1552,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: psrlq $62, %xmm1 ; SSE41-NEXT: paddq %xmm0, %xmm1 @@ -1622,7 +1622,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { define <4 x i64> 
@combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: psrlq $62, %xmm2 ; SSE2-NEXT: paddq %xmm0, %xmm2 @@ -1632,7 +1632,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrlq $61, %xmm3 @@ -1650,7 +1650,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: psrlq $62, %xmm2 ; SSE41-NEXT: paddq %xmm0, %xmm2 @@ -1659,7 +1659,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; SSE41-NEXT: psrlq $2, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlq $60, %xmm3 @@ -1755,7 +1755,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: psrlq $62, %xmm4 ; SSE2-NEXT: paddq %xmm0, %xmm4 @@ -1765,7 +1765,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { 
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: psrlq $62, %xmm4 ; SSE2-NEXT: paddq %xmm2, %xmm4 @@ -1775,7 +1775,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrlq $61, %xmm5 @@ -1789,7 +1789,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] ; SSE2-NEXT: xorpd %xmm4, %xmm1 ; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: psrlq $61, %xmm6 @@ -1806,7 +1806,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: psrlq $62, %xmm4 ; SSE41-NEXT: paddq %xmm0, %xmm4 @@ -1815,7 +1815,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE41-NEXT: psrlq $2, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: psrlq $62, %xmm4 ; SSE41-NEXT: paddq %xmm2, %xmm4 @@ -1824,7 
+1824,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE41-NEXT: psrlq $2, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrlq $60, %xmm5 @@ -1838,7 +1838,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] ; SSE41-NEXT: pxor %xmm4, %xmm1 ; SSE41-NEXT: psubq %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm6 ; SSE41-NEXT: psrlq $60, %xmm6 @@ -2187,14 +2187,15 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: paddw %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] +; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: paddw %xmm4, 
%xmm4 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7] -; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: paddb %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -2222,15 +2223,15 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7] +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128] ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7] -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index c982884314f62..fa5c5ecded124 100644 --- 
a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -724,46 +724,47 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) { ; SSE41-LABEL: combine_vec4i64_ashr_clamped: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pxor %xmm7, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519] -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [63,63] -; SSE41-NEXT: movapd %xmm9, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE41-NEXT: pxor %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483711,2147483711,2147483711,2147483711] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [63,63] +; SSE41-NEXT: movapd %xmm8, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: 
pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm9, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrlq %xmm3, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrlq %xmm8, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: psrlq %xmm3, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm2[0,1,2,3],xmm6[4,5,6,7] ; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: psrlq %xmm9, %xmm2 +; SSE41-NEXT: psrlq %xmm8, %xmm2 ; SSE41-NEXT: psrlq %xmm3, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: psubq %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm6, %xmm4 +; SSE41-NEXT: psubq %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE41-NEXT: psrlq %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] ; SSE41-NEXT: psrlq %xmm3, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlq %xmm6, %xmm2 +; SSE41-NEXT: psrlq %xmm5, %xmm2 ; SSE41-NEXT: psrlq %xmm3, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pxor %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index 55715197830b1..6230c883cd7c5 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -631,7 +631,10 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $15, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: psrlw $7, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll index 1886e2911ede8..cf56effeb348c 100644 --- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll @@ -40,6 +40,8 @@ define i16 @test_cvtss_sh(float %a0) nounwind { ; X86-LABEL: test_cvtss_sh: ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X86-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -47,6 +49,8 @@ define i16 @test_cvtss_sh(float %a0) nounwind { ; ; X64-LABEL: test_cvtss_sh: ; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index 989aabc9e87bd..ad8dcf5083106 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -1258,19 +1258,19 @@ define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) { ; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: 
orps %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: minpd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpunordpd %xmm3, %xmm0 -; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 ; SSE2-NEXT: andnpd %xmm1, %xmm0 -; SSE2-NEXT: orpd %xmm3, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimum_vector_different_zeros: diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index 5945bae94f452..4ab3cda816fd1 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -1252,19 +1252,19 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { ; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: orps %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: minpd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordpd %xmm3, %xmm0 -; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: cmpordpd %xmm2, 
%xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 ; SSE2-NEXT: andnpd %xmm1, %xmm0 -; SSE2-NEXT: orpd %xmm3, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimumnum_vector_different_zeros: diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 59a61722927de..e4df7e8d8877c 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -141,30 +141,26 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-LABEL: fmul_pow2_8xhalf: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: subq $104, %rsp -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112 -; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; CHECK-SSE-NEXT: pslld $23, %xmm1 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; CHECK-SSE-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: pslld $16, %xmm1 -; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; CHECK-SSE-NEXT: subq $120, %rsp +; CHECK-SSE-NEXT: .cfi_def_cfa_offset 128 +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; CHECK-SSE-NEXT: pslld $23, %xmm2 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; CHECK-SSE-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE-NEXT: cvttps2dq %xmm2, %xmm2 +; CHECK-SSE-NEXT: pslld $16, %xmm2 +; CHECK-SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; 
CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd %xmm2, %xmm0 +; CHECK-SSE-NEXT: paddd %xmm3, %xmm0 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: pslld $16, %xmm0 ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: psrld $16, %xmm0 -; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: psrlq $48, %xmm0 +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -173,19 +169,18 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 +; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: psrld $16, %xmm0 +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: psrlq $48, %xmm0 ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 
16-byte Reload -; CHECK-SSE-NEXT: psrlq $48, %xmm0 +; CHECK-SSE-NEXT: psrld $16, %xmm0 +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -194,9 +189,11 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: callq __truncsfhf2@PLT +; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: psrlq $48, %xmm0 ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT @@ -208,9 +205,9 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero 
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT @@ -224,23 +221,23 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq 
__extendhfsf2@PLT @@ -254,11 +251,12 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-SSE-NEXT: addq $104, %rsp +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE-NEXT: addq $120, %rsp ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; @@ -1055,7 +1053,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: subq $40, %rsp -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 0ca3380d188b7..2519c7e6a9720 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -17,32 +17,32 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; GFNISSE-NEXT: pslld $23, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] -; GFNISSE-NEXT: paddd %xmm6, %xmm2 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216] +; GFNISSE-NEXT: paddd %xmm7, %xmm2 ; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2 -; GFNISSE-NEXT: pslld $23, %xmm3 -; GFNISSE-NEXT: paddd %xmm6, %xmm3 -; GFNISSE-NEXT: cvttps2dq %xmm3, %xmm3 -; GFNISSE-NEXT: packusdw %xmm2, %xmm3 -; GFNISSE-NEXT: movdqa %xmm1, %xmm7 -; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; GFNISSE-NEXT: pmullw %xmm3, %xmm7 -; GFNISSE-NEXT: psrlw $8, %xmm7 +; GFNISSE-NEXT: pslld $23, %xmm6 +; GFNISSE-NEXT: paddd %xmm7, %xmm6 +; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm6 +; GFNISSE-NEXT: packusdw %xmm2, %xmm6 +; GFNISSE-NEXT: movdqa %xmm1, %xmm8 +; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; GFNISSE-NEXT: pmullw %xmm6, %xmm8 +; GFNISSE-NEXT: psrlw $8, %xmm8 ; GFNISSE-NEXT: pslld $23, 
%xmm4 -; GFNISSE-NEXT: paddd %xmm6, %xmm4 +; GFNISSE-NEXT: paddd %xmm7, %xmm4 ; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm2 -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; GFNISSE-NEXT: pslld $23, %xmm5 -; GFNISSE-NEXT: paddd %xmm6, %xmm5 +; GFNISSE-NEXT: paddd %xmm7, %xmm5 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm3 ; GFNISSE-NEXT: packusdw %xmm3, %xmm2 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNISSE-NEXT: pmullw %xmm1, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm2 -; GFNISSE-NEXT: packuswb %xmm7, %xmm2 +; GFNISSE-NEXT: packuswb %xmm8, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0 ; GFNISSE-NEXT: retq ; @@ -50,34 +50,34 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vcvttps2dq %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 +; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 +; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: var_fshl_v16i8: @@ -541,7 +541,7 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE-NEXT: movdqa %xmm4, %xmm10 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; GFNISSE-NEXT: pslld $23, %xmm10 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] ; GFNISSE-NEXT: paddd %xmm4, %xmm10 @@ -557,7 +557,7 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE-NEXT: pslld $23, %xmm0 ; GFNISSE-NEXT: paddd %xmm4, %xmm0 ; GFNISSE-NEXT: cvttps2dq %xmm0, %xmm0 -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; GFNISSE-NEXT: pslld $23, %xmm9 ; GFNISSE-NEXT: paddd %xmm4, %xmm9 ; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm9 @@ 
-570,23 +570,23 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; GFNISSE-NEXT: pslld $23, %xmm5 ; GFNISSE-NEXT: paddd %xmm4, %xmm5 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5 -; GFNISSE-NEXT: pslld $23, %xmm7 -; GFNISSE-NEXT: paddd %xmm4, %xmm7 -; GFNISSE-NEXT: cvttps2dq %xmm7, %xmm7 -; GFNISSE-NEXT: packusdw %xmm5, %xmm7 +; GFNISSE-NEXT: pslld $23, %xmm8 +; GFNISSE-NEXT: paddd %xmm4, %xmm8 +; GFNISSE-NEXT: cvttps2dq %xmm8, %xmm8 +; GFNISSE-NEXT: packusdw %xmm5, %xmm8 ; GFNISSE-NEXT: movdqa %xmm3, %xmm5 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; GFNISSE-NEXT: pmullw %xmm7, %xmm5 +; GFNISSE-NEXT: pmullw %xmm8, %xmm5 ; GFNISSE-NEXT: psrlw $8, %xmm5 ; GFNISSE-NEXT: pslld $23, %xmm2 ; GFNISSE-NEXT: paddd %xmm4, %xmm2 ; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2 -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; GFNISSE-NEXT: pslld $23, %xmm6 ; GFNISSE-NEXT: paddd %xmm4, %xmm6 ; 
GFNISSE-NEXT: cvttps2dq %xmm6, %xmm4 @@ -601,17 +601,17 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX1-LABEL: var_fshl_v32i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; GFNIAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm7 -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm7 +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 @@ -619,48 +619,48 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9 -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9 +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm9 ; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9 -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4 -; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4 -; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 +; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; GFNIAVX1-NEXT: 
vpmullw %xmm6, %xmm7, %xmm6 +; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 +; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: var_fshl_v32i8: @@ -1365,15 +1365,15 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm1, %xmm8 ; GFNISSE-NEXT: movdqa %xmm0, %xmm1 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; GFNISSE-NEXT: pand %xmm9, %xmm0 -; GFNISSE-NEXT: pxor %xmm10, %xmm10 +; GFNISSE-NEXT: pand %xmm10, %xmm0 +; GFNISSE-NEXT: pxor %xmm9, %xmm9 ; GFNISSE-NEXT: 
pmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; GFNISSE-NEXT: pslld $23, %xmm0 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [1065353216,1065353216,1065353216,1065353216] ; GFNISSE-NEXT: paddd %xmm11, %xmm0 @@ -1389,7 +1389,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: pslld $23, %xmm12 ; GFNISSE-NEXT: paddd %xmm11, %xmm12 ; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm0 -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] ; GFNISSE-NEXT: pslld $23, %xmm13 ; GFNISSE-NEXT: paddd %xmm11, %xmm13 ; GFNISSE-NEXT: cvttps2dq %xmm13, %xmm12 @@ -1399,12 +1399,12 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm15, %xmm0 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; GFNISSE-NEXT: pand %xmm9, %xmm1 +; GFNISSE-NEXT: pand %xmm10, %xmm1 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; 
GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; GFNISSE-NEXT: pslld $23, %xmm1 ; GFNISSE-NEXT: paddd %xmm11, %xmm1 ; GFNISSE-NEXT: cvttps2dq %xmm1, %xmm1 @@ -1419,7 +1419,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: pslld $23, %xmm4 ; GFNISSE-NEXT: paddd %xmm11, %xmm4 ; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm1 -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] ; GFNISSE-NEXT: pslld $23, %xmm12 ; GFNISSE-NEXT: paddd %xmm11, %xmm12 ; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm4 @@ -1429,12 +1429,12 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: packuswb %xmm14, %xmm1 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 -; GFNISSE-NEXT: pand %xmm9, %xmm4 +; GFNISSE-NEXT: pand %xmm10, %xmm4 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; GFNISSE-NEXT: punpckhbw 
{{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; GFNISSE-NEXT: pslld $23, %xmm4 ; GFNISSE-NEXT: paddd %xmm11, %xmm4 ; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm4 @@ -1449,7 +1449,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: pslld $23, %xmm5 ; GFNISSE-NEXT: paddd %xmm11, %xmm5 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm4 -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; GFNISSE-NEXT: pslld $23, %xmm8 ; GFNISSE-NEXT: paddd %xmm11, %xmm8 ; GFNISSE-NEXT: cvttps2dq %xmm8, %xmm5 @@ -1458,15 +1458,15 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: pmullw %xmm6, %xmm4 ; GFNISSE-NEXT: psrlw $8, %xmm4 ; GFNISSE-NEXT: packuswb %xmm13, %xmm4 -; GFNISSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm9 -; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero -; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm5 = 
xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; GFNISSE-NEXT: pslld $23, %xmm9 -; GFNISSE-NEXT: paddd %xmm11, %xmm9 -; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm8 +; GFNISSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm10 +; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; GFNISSE-NEXT: pslld $23, %xmm10 +; GFNISSE-NEXT: paddd %xmm11, %xmm10 +; GFNISSE-NEXT: cvttps2dq %xmm10, %xmm8 ; GFNISSE-NEXT: pslld $23, %xmm5 ; GFNISSE-NEXT: paddd %xmm11, %xmm5 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5 @@ -1478,7 +1478,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: pslld $23, %xmm2 ; GFNISSE-NEXT: paddd %xmm11, %xmm2 ; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm5 -; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] ; GFNISSE-NEXT: pslld $23, %xmm6 ; GFNISSE-NEXT: paddd %xmm11, %xmm6 ; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm2 @@ -1496,16 +1496,16 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX1-NEXT: vandps %ymm7, %ymm4, %ymm8 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm8, %xmm9 -; 
GFNIAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm11 -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11 +; GFNIAVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm11 +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm11, %xmm11 ; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10 ; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 @@ -1515,26 +1515,26 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero ; GFNIAVX1-NEXT: vpslld $23, %xmm13, %xmm13 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm13, %xmm13 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm13, %xmm13 ; GFNIAVX1-NEXT: vcvttps2dq %xmm13, %xmm13 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = 
xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm9 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm9, %xmm9 ; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9 ; GFNIAVX1-NEXT: vpackusdw %xmm9, %xmm13, %xmm9 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; GFNIAVX1-NEXT: vpmullw %xmm9, %xmm11, %xmm9 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm9, %xmm9 ; GFNIAVX1-NEXT: vpackuswb %xmm10, %xmm9, %xmm9 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm11, %xmm11 ; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10 ; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] @@ -1542,12 +1542,12 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero ; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm11, %xmm11 ; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8 ; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8 ; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm11, %xmm8 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -1557,14 +1557,14 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vandps %ymm7, %ymm5, %ymm2 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8 ; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 @@ -1574,45 +1574,45 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm10, %xmm5 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; GFNIAVX1-NEXT: vpmullw %xmm5, 
%xmm8, %xmm5 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 -; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 +; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 +; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2 +; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm8, %xmm2 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index 1a2aac657d30f..eff6bcfe570a1 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -542,16 +542,14 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { define <4 x i1> 
@vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -567,16 +565,14 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq @@ -622,16 
+618,14 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -647,16 +641,14 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE2-NEXT: pand %xmm1, 
%xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index 63336ffa7c6c8..f6b8839afb2d7 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -1243,7 +1243,8 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) { define i32 @bitcast_known_nonzero(<2 x i16> %xx) { ; X86-LABEL: bitcast_known_nonzero: ; X86: # %bb.0: -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: pslld $23, %xmm0 ; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: cvttps2dq %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll index e183bbc15617d..be4605c007092 100644 --- a/llvm/test/CodeGen/X86/known-pow2.ll +++ b/llvm/test/CodeGen/X86/known-pow2.ll @@ -30,12 +30,12 @@ define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) { ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: psrld $1, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-NEXT: movdqa %xmm1, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: psrld $1, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll index 473fecc307ed4..295a2eab029ec 100644 --- a/llvm/test/CodeGen/X86/known-signbits-shl.ll +++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -70,7 +70,8 @@ define void @computeNumSignBits_shl_zext_vec_1(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: psubb %xmm1, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2048,8192,u,u,u,u,u,u] ; X64-NEXT: movd %xmm0, (%rdi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 45b61155fe626..5368934fa5bf1 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -192,8 +192,10 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind { ; X86-LABEL: signbits_ashr_shl_extract_sitofp: ; X86: # %bb.0: ; X86-NEXT: pushl %eax +; X86-NEXT: vpsrad $31, %xmm0, %xmm1 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-NEXT: vpsrad $29, %xmm0, %xmm0 +; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X86-NEXT: vpsllq $20, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) @@ -201,13 +203,25 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind { ; X86-NEXT: popl %eax ; X86-NEXT: retl ; -; X64-LABEL: signbits_ashr_shl_extract_sitofp: -; X64: # %bb.0: -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-NEXT: vpsllq $20, 
%xmm0, %xmm0 -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 -; X64-NEXT: retq +; X64-AVX1-LABEL: signbits_ashr_shl_extract_sitofp: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX1-NEXT: vpsrad $29, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X64-AVX1-NEXT: vpsllq $20, %xmm0, %xmm0 +; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: signbits_ashr_shl_extract_sitofp: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX2-NEXT: vpsrad $29, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; X64-AVX2-NEXT: vpsllq $20, %xmm0, %xmm0 +; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = shl <2 x i64> %1, %3 = extractelement <2 x i64> %2, i32 0 @@ -459,8 +473,10 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x ; ; X64-AVX2-LABEL: signbits_ashr_sext_select_shuffle_sitofp: ; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrad $31, %ymm2, %ymm4 ; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] ; X64-AVX2-NEXT: vpsrad $1, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] ; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index c7320275091c6..f4f7f44038153 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -142,8 +142,12 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val) ; ; SSE4-LABEL: store_v4f64_v4i64: ; SSE4: ## %bb.0: -; SSE4-NEXT: packssdw 
%xmm1, %xmm0 -; SSE4-NEXT: movmskps %xmm0, %eax +; SSE4-NEXT: pxor %xmm4, %xmm4 +; SSE4-NEXT: pxor %xmm5, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: movmskps %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne LBB2_1 ; SSE4-NEXT: ## %bb.2: ## %else @@ -1036,8 +1040,12 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) no ; ; SSE4-LABEL: store_v4i64_v4i64: ; SSE4: ## %bb.0: -; SSE4-NEXT: packssdw %xmm1, %xmm0 -; SSE4-NEXT: movmskps %xmm0, %eax +; SSE4-NEXT: pxor %xmm4, %xmm4 +; SSE4-NEXT: pxor %xmm5, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: movmskps %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne LBB8_1 ; SSE4-NEXT: ## %bb.2: ## %else @@ -6170,7 +6178,7 @@ define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind { ; AVX2-LABEL: undefshuffle: ; AVX2: ## %bb.0: ; AVX2-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,2,u,u,u,4,u,u,u,6,u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u],zero,zero,ymm0[u,u],zero,zero,ymm0[u,u],zero,zero,ymm0[u,u],zero,zero,ymm0[u,u] ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 9b624a935bada..7f50cac5e4290 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -824,7 +824,11 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) { define i1 @allones_v8i64_sign(<8 x i64> %arg) { ; SSE-LABEL: allones_v8i64_sign: ; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 @@ -835,11 +839,16 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) { ; ; AVX1-LABEL: allones_v8i64_sign: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al @@ -848,6 +857,9 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) { ; ; AVX2-LABEL: allones_v8i64_sign: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vtestps %ymm1, %ymm0 @@ -881,21 +893,38 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) { define i1 @allzeros_v8i64_sign(<8 x i64> %arg) { ; SSE-LABEL: allzeros_v8i64_sign: ; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE-NEXT: packssdw %xmm5, %xmm3 +; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: packssdw %xmm2, %xmm4 +; SSE-NEXT: packssdw %xmm3, %xmm4 +; SSE-NEXT: pmovmskb %xmm4, %eax ; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_sign: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper @@ -903,6 +932,9 @@ define i1 @allzeros_v8i64_sign(<8 x i64> %arg) { ; ; AVX2-LABEL: allzeros_v8i64_sign: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al @@ -1968,6 +2000,7 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) { ; SSE-LABEL: allones_v2i64_and1: ; SSE: # %bb.0: ; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: movmskpd %xmm0, %eax ; SSE-NEXT: cmpl $3, %eax ; SSE-NEXT: sete %al @@ -2151,11 +2184,15 @@ define i1 @allzeros_v4i64_and1(<4 x i64> %arg) { define i1 @allones_v8i64_and1(<8 x i64> %arg) { ; SSE-LABEL: allones_v8i64_and1: ; SSE: # %bb.0: -; SSE-NEXT: psllq $63, %xmm3 -; SSE-NEXT: psllq $63, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] 
+; SSE-NEXT: pslld $31, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE-NEXT: pslld $31, %xmm2 ; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: psllq $63, %xmm1 -; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE-NEXT: pslld $31, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 @@ -2167,12 +2204,17 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) { ; AVX1-LABEL: allones_v8i64_and1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq $63, %xmm1, %xmm2 -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -2183,8 +2225,11 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) { ; ; AVX2-LABEL: allones_v8i64_and1: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $63, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vtestps %ymm1, %ymm0 @@ -3169,6 +3214,7 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) { ; SSE-LABEL: allones_v2i64_and4: ; SSE: # %bb.0: ; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: movmskpd %xmm0, %eax ; SSE-NEXT: cmpl $3, %eax ; SSE-NEXT: 
sete %al @@ -3353,10 +3399,14 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) { ; SSE-LABEL: allones_v8i64_and4: ; SSE: # %bb.0: ; SSE-NEXT: psllq $61, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: psllq $61, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: psllq $61, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 @@ -3368,12 +3418,17 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) { ; AVX1-LABEL: allones_v8i64_and4: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq $61, %xmm1, %xmm2 -; AVX1-NEXT: vpsllq $61, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -3384,8 +3439,11 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) { ; ; AVX2-LABEL: allones_v8i64_and4: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $61, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vtestps %ymm1, %ymm0 @@ -4098,7 +4156,7 @@ define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) { ; KNL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxwq 
%xmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $7, %edi @@ -4143,7 +4201,7 @@ define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $3, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,4), %eax @@ -4200,7 +4258,7 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,8), %eax @@ -4247,7 +4305,7 @@ define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmpeq_uqps %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $3, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,4), %eax @@ -4291,7 +4349,7 @@ define i1 @movmsk_v2f64_var(<2 x double> %x, <2 x double> %y, i32 %z) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmplepd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} 
{z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,8), %eax @@ -4485,6 +4543,7 @@ define i32 @pr67287(<2 x i64> %broadcast.splatinsert25) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: setne %al diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll index bbda4d68bb685..e5b0b11204e85 100644 --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -145,13 +145,14 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) { ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] -; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE42-NEXT: pmuludq %xmm1, %xmm0 -; SSE42-NEXT: movdqa %xmm2, %xmm1 +; SSE42-NEXT: pxor %xmm3, %xmm3 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE42-NEXT: pmuludq %xmm0, %xmm1 +; SSE42-NEXT: pmuludq %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: _mul4xi32toi64a: diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll index 9e398096bfcc5..d5ceff5709974 100644 --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -130,18 
+130,21 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; SSE2-LABEL: p5_vector_urem_by_const__nonsplat: ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -149,7 +152,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = [1,2147483648] +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2147483648,1] ; SSE4-NEXT: pmuludq %xmm0, %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; SSE4-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index c7cc2acaf2627..5e7f8827b9c85 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -1007,13 +1007,15 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; ; SSE41-LABEL: mul_v4i64_zero_upper: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pmuludq %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE41-NEXT: pmuludq %xmm3, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE41-NEXT: pmuludq %xmm0, %xmm1 +; SSE41-NEXT: pmuludq %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: mul_v4i64_zero_upper: @@ -1162,20 +1164,23 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero -; SSE41-NEXT: pmuludq %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] -; SSE41-NEXT: pmuludq %xmm5, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE41-NEXT: pmovzxdq 
{{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: pmuludq %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] -; SSE41-NEXT: pmuludq %xmm7, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE41-NEXT: pmuludq %xmm0, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE41-NEXT: pmuludq %xmm1, %xmm3 +; SSE41-NEXT: pmuludq %xmm7, %xmm4 +; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] +; SSE41-NEXT: pmuludq %xmm8, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm5, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v8i64_zero_upper: @@ -1214,25 +1219,25 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 +; 
SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] @@ -1240,40 +1245,43 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-NEXT: pxor %xmm13, %xmm13 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm12 -; SSE2-NEXT: paddq %xmm14, %xmm12 -; SSE2-NEXT: psllq $32, %xmm12 +; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm14 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] +; SSE2-NEXT: movdqa 
%xmm12, %xmm15 +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm15 +; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm14 +; SSE2-NEXT: paddq %xmm15, %xmm14 +; SSE2-NEXT: psllq $32, %xmm14 ; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm9, %xmm11 -; SSE2-NEXT: paddq %xmm4, %xmm11 -; SSE2-NEXT: psllq $32, %xmm11 -; SSE2-NEXT: pmuludq %xmm9, %xmm1 -; SSE2-NEXT: paddq %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] +; SSE2-NEXT: paddq %xmm14, %xmm0 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; SSE2-NEXT: pmuludq %xmm1, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm11, %xmm13 +; SSE2-NEXT: paddq %xmm12, %xmm13 +; SSE2-NEXT: psllq $32, %xmm13 +; SSE2-NEXT: pmuludq %xmm11, %xmm1 +; SSE2-NEXT: paddq %xmm13, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] ; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm6, %xmm9 -; SSE2-NEXT: paddq %xmm4, %xmm9 +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm8, %xmm10 +; SSE2-NEXT: paddq %xmm4, %xmm10 +; SSE2-NEXT: psllq $32, %xmm10 +; SSE2-NEXT: pmuludq %xmm8, %xmm2 +; SSE2-NEXT: paddq %xmm10, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: pmuludq %xmm3, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm5, %xmm9 +; SSE2-NEXT: paddq %xmm6, %xmm9 ; SSE2-NEXT: psllq $32, %xmm9 -; SSE2-NEXT: pmuludq %xmm6, %xmm2 -; SSE2-NEXT: paddq %xmm9, %xmm2 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm7[2,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm6 -; SSE2-NEXT: paddq %xmm4, %xmm6 -; SSE2-NEXT: psllq $32, %xmm6 ; SSE2-NEXT: pmuludq %xmm5, %xmm3 -; SSE2-NEXT: paddq %xmm6, %xmm3 +; SSE2-NEXT: paddq %xmm9, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 300da68d9a3b3..b54029fa76553 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -54,9 +54,9 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) { ; ; AVX2-LABEL: and_mulhuw_v4i16: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -65,6 +65,8 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512-LABEL: and_mulhuw_v4i16: ; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] ; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/pr107423.ll b/llvm/test/CodeGen/X86/pr107423.ll index d5119d45f97c0..5c43276f0dc38 100644 --- a/llvm/test/CodeGen/X86/pr107423.ll +++ b/llvm/test/CodeGen/X86/pr107423.ll @@ -7,21 +7,21 @@ define void @PR107423(<64 x i8> %arg, ptr %p0) { ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpsllw $8, %xmm2, %xmm2 ; CHECK-NEXT: vpsllw $8, %xmm1, %xmm3 
-; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm2 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm4 -; CHECK-NEXT: vpaddb %xmm1, %xmm4, %xmm1 -; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm4 -; CHECK-NEXT: vpsllw $8, %xmm4, %xmm4 -; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm4 +; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; CHECK-NEXT: vpsllw $8, %xmm1, %xmm1 -; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpor %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovdqu %xmm0, 16(%rdi) -; CHECK-NEXT: vmovdqu %xmm2, (%rdi) +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpsllw $8, %xmm1, %xmm2 +; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmovdqu %xmm0, (%rdi) +; CHECK-NEXT: vmovdqu %xmm1, 16(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %i3 = bitcast <64 x i8> %arg to <32 x i16> diff --git a/llvm/test/CodeGen/X86/pr35918.ll b/llvm/test/CodeGen/X86/pr35918.ll index 7e63b0abfae69..f57fab3084a9e 100644 --- a/llvm/test/CodeGen/X86/pr35918.ll +++ b/llvm/test/CodeGen/X86/pr35918.ll @@ -13,7 +13,7 @@ define void @fetch_r16g16_snorm_unorm8(ptr, ptr, i32, i32, ptr) nounwind { ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpsrlw $7, %xmm0, %xmm0 -; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u] +; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; X86-NEXT: vmovd %xmm0, %ecx ; X86-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 ; X86-NEXT: movl %ecx, (%eax) @@ -25,7 +25,7 @@ define void @fetch_r16g16_snorm_unorm8(ptr, ptr, i32, 
i32, ptr) nounwind { ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpsrlw $7, %xmm0, %xmm0 -; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u] +; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 ; X64-NEXT: movl %eax, (%rdi) diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll index ad778b4970cbf..5d11f1c960a8c 100644 --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -5,6 +5,8 @@ define void @foo(double %arg) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: movl %eax, (%rax) ; CHECK-NEXT: movq $0, (%rax) diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll index cf1fa5a8fc493..286015840d4c7 100644 --- a/llvm/test/CodeGen/X86/pr42727.ll +++ b/llvm/test/CodeGen/X86/pr42727.ll @@ -7,7 +7,7 @@ define void @_ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_(ptr byval ; CHECK-LABEL: _ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vpbroadcastd (%eax), %xmm1 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vpsllq $56, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%eax) diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll index 72877e1b1d67d..00430c835837f 100644 --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -39,7 +39,7 @@ define <9 x float> @mload_split9(<9 x i1> %mask, ptr %addr, <9 x float> %dst) { ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm4 ; CHECK-NEXT: 
vblendvps %ymm1, %ymm4, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u] +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll index 04c342b6673ed..aa6a38a841f43 100644 --- a/llvm/test/CodeGen/X86/pr45833.ll +++ b/llvm/test/CodeGen/X86/pr45833.ll @@ -29,7 +29,7 @@ define void @mstore_split9(<9 x float> %value, ptr %addr, <9 x i1> %mask) { ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u] +; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi) ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll index 96f6a18819383..248b48092ff1a 100644 --- a/llvm/test/CodeGen/X86/pr77459.ll +++ b/llvm/test/CodeGen/X86/pr77459.ll @@ -80,7 +80,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) { ; SSE42-LABEL: reverse_cmp_v8i1: ; SSE42: # %bb.0: ; SSE42-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] ; SSE42-NEXT: packsswb %xmm0, %xmm0 ; SSE42-NEXT: pmovmskb %xmm0, %eax ; SSE42-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll 
b/llvm/test/CodeGen/X86/promote-cmp.ll index 88934a382bbfa..fb0bf5ced3e51 100644 --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -27,33 +27,36 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) { ; SSE2-NEXT: andps %xmm10, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm7[1,3] ; SSE2-NEXT: orps %xmm4, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,3,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm0 +; SSE2-NEXT: pslld $31, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: PR45808: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm4 -; SSE4-NEXT: movdqa %xmm0, %xmm5 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm5 ; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE4-NEXT: movdqa %xmm4, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm2, %xmm5 +; SSE4-NEXT: packssdw %xmm0, %xmm5 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; SSE4-NEXT: pmovsxdq %xmm5, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE4-NEXT: psllq $63, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE4-NEXT: pshufd 
{{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE4-NEXT: movapd %xmm4, %xmm0 +; SSE4-NEXT: movapd %xmm2, %xmm0 ; SSE4-NEXT: movapd %xmm3, %xmm1 ; SSE4-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll index f28a2ad0fd009..df1e9d61e3ef4 100644 --- a/llvm/test/CodeGen/X86/promote-vec3.ll +++ b/llvm/test/CodeGen/X86/promote-vec3.ll @@ -18,9 +18,10 @@ define <3 x i16> @zext_i8(<3 x i8>) { ; ; SSE41-LABEL: zext_i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; SSE41-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax @@ -30,9 +31,10 @@ define <3 x i16> @zext_i8(<3 x i8>) { ; ; AVX-32-LABEL: zext_i8: ; AVX-32: # %bb.0: -; AVX-32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; AVX-32-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; AVX-32-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-32-NEXT: vmovd %xmm0, %eax ; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax @@ -42,9 +44,10 @@ define <3 x i16> @zext_i8(<3 x i8>) { ; ; AVX-64-LABEL: zext_i8: ; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 ; AVX-64-NEXT: movzbl %dl, %ecx ; AVX-64-NEXT: movzbl %sil, %edx -; AVX-64-NEXT: vmovd %edi, %xmm0 ; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-64-NEXT: vmovd 
%xmm0, %eax ; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 73ee28a7fd247..be32b3497933d 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1670,48 +1670,51 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm4, %xmm8 -; SSE41-NEXT: pxor %xmm9, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm7, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] -; SSE41-NEXT: movapd %xmm8, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535] +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 
+; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: packusdw %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm7, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: packusdw %xmm3, %xmm8 -; SSE41-NEXT: packusdw %xmm4, %xmm8 -; SSE41-NEXT: psubusw %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: packusdw %xmm3, %xmm7 +; SSE41-NEXT: packusdw %xmm4, %xmm7 +; SSE41-NEXT: psubusw %xmm7, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 
; SSE41-NEXT: retq ; @@ -2766,52 +2769,55 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; ; SSE41-LABEL: test33: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm10, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm8, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] -; SSE41-NEXT: pmaxud %xmm3, %xmm7 -; SSE41-NEXT: psubd %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm10, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] +; SSE41-NEXT: pmaxud %xmm3, %xmm6 +; SSE41-NEXT: psubd %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm8, %xmm1 +; SSE41-NEXT: psubd %xmm8, %xmm1 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 -; SSE41-NEXT: movdqa 
%xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test33: @@ -2952,50 +2958,53 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm10, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm8, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm11 +; SSE41-NEXT: pcmpgtd 
%xmm3, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] ; SSE41-NEXT: pmaxud %xmm3, %xmm6 ; SSE41-NEXT: psubd %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm10, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm8, %xmm1 +; SSE41-NEXT: psubd %xmm8, %xmm1 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: retq ; diff --git 
a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll index 1ead3f98ab5d6..36783d10552a5 100644 --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -149,19 +149,12 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind { ; Result would undershift define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind { -; X86-LABEL: no_extract_shl: -; X86: # %bb.0: -; X86-NEXT: vpsllq $24, %ymm0, %ymm1 -; X86-NEXT: vpsrlq $39, %ymm0, %ymm0 -; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0 -; X86-NEXT: retl -; -; X64-LABEL: no_extract_shl: -; X64: # %bb.0: -; X64-NEXT: vpsllq $24, %ymm0, %ymm1 -; X64-NEXT: vpsrlq $39, %ymm0, %ymm0 -; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 -; X64-NEXT: retq +; CHECK-LABEL: no_extract_shl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsllq $24, %ymm0, %ymm1 +; CHECK-NEXT: vpsrlq $39, %ymm0, %ymm0 +; CHECK-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & mem) | ymm1 +; CHECK-NEXT: ret{{[l|q]}} %lhs_mul = shl <4 x i64> %i, %rhs_mul = shl <4 x i64> %i, %lhs_shift = lshr <4 x i64> %lhs_mul, @@ -171,19 +164,12 @@ define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind { ; Result would overshift define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind { -; X86-LABEL: no_extract_shrl: -; X86: # %bb.0: -; X86-NEXT: vpsrld $9, %xmm0, %xmm1 -; X86-NEXT: vpslld $25, %xmm0, %xmm0 -; X86-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0 -; X86-NEXT: retl -; -; X64-LABEL: no_extract_shrl: -; X64: # %bb.0: -; X64-NEXT: vpsrld $9, %xmm0, %xmm1 -; X64-NEXT: vpslld $25, %xmm0, %xmm0 -; X64-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 -; X64-NEXT: retq +; CHECK-LABEL: no_extract_shrl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrld $9, %xmm0, %xmm1 +; CHECK-NEXT: vpslld $25, %xmm0, %xmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = (xmm0 & mem) | xmm1 +; CHECK-NEXT: 
ret{{[l|q]}} %lhs_div = lshr <4 x i32> %i, %rhs_div = lshr <4 x i32> %i, %lhs_shift = shl <4 x i32> %lhs_div, @@ -234,10 +220,12 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $3, {{[0-9]+}}(%esp) ; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __udivdi3 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index bd563f97b0ac4..90c4d9a2b8ccc 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1175,22 +1175,25 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: paddq %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = 
xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1352,40 +1355,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm8 = 
[9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm6, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1650,74 +1659,86 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = 
[2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, 
%xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE41-NEXT: pxor %xmm12, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, 
%xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm3, %xmm10 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm7, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: pxor %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index b12be7cb129d3..69e6ff7770ebe 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -631,12 +631,13 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372034707292117,9223372034707292117] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -1163,10 +1164,12 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372034707292159,9223372034707292159] ; SSE41-NEXT: pxor %xmm1, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: paddq %xmm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll index 456819179fcdc..41ae0dd9e8eb1 100644 --- a/llvm/test/CodeGen/X86/sdiv-exact.ll +++ b/llvm/test/CodeGen/X86/sdiv-exact.ll @@ -83,11 +83,12 @@ define <4 x i32> @test5(<4 x i32> %x) { ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3264175145,3264175145] +; X86-NEXT: movaps %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; 
X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -108,11 +109,12 @@ define <4 x i32> @test6(<4 x i32> %x) { ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: psrad $1, %xmm0 ; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997] -; X86-NEXT: pmuludq %xmm0, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3303820997,3303820997] +; X86-NEXT: movapd %xmm0, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqa %xmm1, %xmm0 @@ -130,11 +132,12 @@ define <4 x i32> @test6(<4 x i32> %x) { define <4 x i32> @test7(<4 x i32> %x) { ; X86-LABEL: test7: ; X86: # %bb.0: -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,1749801491,1749801491] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl ; @@ -152,11 +155,12 @@ define <4 x i32> @test8(<4 x i32> %x) { ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = 
[1,1,2863311531,2863311531] +; X86-NEXT: movapd %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e7727a0ab6178..a1cabb433d879 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -563,18 +563,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: subq $120, %rsp ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X64-NEXT: psrlq $31, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; X64-NEXT: psrad $31, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm3, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: psrlq $31, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %rbp +; X64-NEXT: movq %rbp, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbp, %r14 +; X64-NEXT: movq %rbp, %r15 +; X64-NEXT: shlq $31, %r15 ; X64-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 @@ -582,112 +584,113 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %r12 +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %rbp +; X64-NEXT: xorl %ebp, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rdx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rdx, %r13 +; X64-NEXT: movq %r12, 
%rax ; X64-NEXT: sbbq $0, %rax -; X64-NEXT: cmovgeq %rcx, %r14 -; X64-NEXT: cmovgeq %rdx, %rbp +; X64-NEXT: cmovgeq %rdx, %r13 +; X64-NEXT: cmovgeq %rcx, %r12 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rcx +; X64-NEXT: cmpq %r13, %rcx ; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq $-1, %rax -; X64-NEXT: sbbq %r14, %rax -; X64-NEXT: cmovgeq %rcx, %rbp -; X64-NEXT: movq %rbp, %xmm0 +; X64-NEXT: sbbq %r12, %rax +; X64-NEXT: cmovgeq %rcx, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %xmm0, %rbp +; X64-NEXT: movq %rbp, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbp, %r14 +; X64-NEXT: movq %rbp, %r15 +; X64-NEXT: shlq $31, %r15 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq 
%r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %r12 +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %rbp +; X64-NEXT: xorl %ebp, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %r12, %rax ; X64-NEXT: sbbq $0, %rax +; X64-NEXT: cmovgeq %rcx, %r13 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovgeq %rax, %r14 -; X64-NEXT: cmovgeq %rcx, %rbp +; X64-NEXT: cmovgeq %rax, %r12 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rcx +; X64-NEXT: cmpq %r13, %rcx ; X64-NEXT: movq $-1, %rax -; X64-NEXT: sbbq %r14, %rax -; X64-NEXT: cmovgeq %rcx, %rbp -; X64-NEXT: movq %rbp, %xmm0 +; X64-NEXT: sbbq %r12, %rax +; X64-NEXT: cmovgeq %rcx, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; X64-NEXT: 
psrad $31, %xmm1 ; X64-NEXT: psrlq $31, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: psrad $31, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %xmm0, %rbp +; X64-NEXT: movq %rbp, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbp, %r14 +; X64-NEXT: movq %rbp, %r15 +; X64-NEXT: shlq $31, %r15 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: pxor %xmm1, %xmm1 ; X64-NEXT: pcmpgtd %xmm0, %xmm1 @@ -695,94 +698,92 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %r12 +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; 
X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %rbp +; X64-NEXT: xorl %ebp, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %r12, %rax ; X64-NEXT: sbbq $0, %rax +; X64-NEXT: cmovgeq %rcx, %r13 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovgeq %rax, %r14 -; X64-NEXT: cmovgeq %rcx, %rbp +; X64-NEXT: cmovgeq %rax, %r12 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rcx +; X64-NEXT: cmpq %r13, %rcx ; X64-NEXT: movq $-1, %rax -; X64-NEXT: sbbq %r14, %rax -; X64-NEXT: cmovgeq %rcx, %rbp -; X64-NEXT: movq %rbp, %xmm0 +; X64-NEXT: sbbq %r12, %rax +; X64-NEXT: cmovgeq %rcx, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %xmm0, %rbp +; X64-NEXT: movq %rbp, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbp, %r14 +; X64-NEXT: movq %rbp, %r15 +; X64-NEXT: shlq $31, %r15 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, 
%rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %r12 +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %rbp +; X64-NEXT: xorl %ebp, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %r12, %rax ; X64-NEXT: sbbq $0, %rax +; X64-NEXT: cmovgeq %rcx, %r13 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovgeq %rax, %r14 -; X64-NEXT: cmovgeq %rcx, %rbp +; X64-NEXT: cmovgeq %rax, %r12 ; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rax -; X64-NEXT: sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: cmovgeq %rax, %rbp -; X64-NEXT: movq %rbp, %xmm1 +; X64-NEXT: cmpq %r13, %rax +; X64-NEXT: sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: cmovgeq %rax, %r13 +; 
X64-NEXT: movq %r13, %xmm1 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: psrlq $1, %xmm0 diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index e53eed4587797..5f147784a74a3 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1921,6 +1921,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: psllq $32, %xmm0 @@ -1942,6 +1943,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-SSE-NEXT: psrad $16, %xmm0 ; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: psllq $32, %xmm0 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 08d9183bd30b6..bd19fa16a994b 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -555,7 +555,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ 
-1098,7 +1098,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,u,1,u] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783] ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] @@ -1354,7 +1354,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -2068,11 +2068,12 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm1, %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrlq $32, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pand %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm0 ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2139,11 +2140,12 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) 
no ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm1, %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrlq $32, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pand %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm0 ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index f91758b861b4c..1ce34bd98f850 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -11,39 +11,37 @@ declare <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8>, <16 x i8>) define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X64-LABEL: vec_v2i64: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X64-NEXT: movdqa %xmm2, %xmm3 -; X64-NEXT: psrlq %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; X64-NEXT: movdqa %xmm2, %xmm5 -; X64-NEXT: psrlq %xmm4, %xmm5 -; X64-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; X64-NEXT: movdqa %xmm0, %xmm6 -; X64-NEXT: psllq %xmm1, %xmm6 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psllq %xmm4, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm7 -; X64-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; X64-NEXT: psrlq %xmm1, %xmm6 -; X64-NEXT: psrlq %xmm4, %xmm7 -; X64-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; X64-NEXT: xorpd %xmm5, %xmm7 -; X64-NEXT: psubq %xmm5, %xmm7 -; X64-NEXT: pcmpeqd %xmm0, %xmm7 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,0,3,2] -; X64-NEXT: pand %xmm7, %xmm1 
-; X64-NEXT: andpd %xmm1, %xmm3 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-NEXT: movdqa %xmm3, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X64-NEXT: pand %xmm2, %xmm0 ; X64-NEXT: pxor %xmm5, %xmm5 ; X64-NEXT: pcmpgtd %xmm4, %xmm5 -; X64-NEXT: por %xmm2, %xmm5 -; X64-NEXT: pcmpeqd %xmm2, %xmm2 -; X64-NEXT: pxor %xmm5, %xmm2 -; X64-NEXT: por %xmm0, %xmm2 -; X64-NEXT: pandn %xmm2, %xmm1 -; X64-NEXT: por %xmm3, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: pand %xmm3, %xmm4 +; X64-NEXT: por %xmm3, %xmm5 +; X64-NEXT: psrlq %xmm1, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; X64-NEXT: psrlq %xmm6, %xmm2 +; X64-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psllq %xmm1, %xmm3 +; X64-NEXT: movdqa %xmm0, %xmm7 +; X64-NEXT: psllq %xmm6, %xmm7 +; X64-NEXT: movdqa %xmm7, %xmm8 +; X64-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; X64-NEXT: psrlq %xmm1, %xmm3 +; X64-NEXT: psrlq %xmm6, %xmm8 +; X64-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] +; X64-NEXT: xorpd %xmm2, %xmm8 +; X64-NEXT: psubq %xmm2, %xmm8 +; X64-NEXT: pcmpeqd %xmm0, %xmm8 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,0,3,2] +; X64-NEXT: pand %xmm8, %xmm0 +; X64-NEXT: andpd %xmm0, %xmm7 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm5, %xmm1 +; X64-NEXT: por %xmm4, %xmm1 +; X64-NEXT: pandn %xmm1, %xmm0 +; X64-NEXT: por %xmm7, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v2i64: @@ -278,63 +276,63 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-LABEL: vec_v8i16: ; X64: # %bb.0: +; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; X64-NEXT: pslld $23, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm3 = 
[1065353216,1065353216,1065353216,1065353216] -; X64-NEXT: paddd %xmm3, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X64-NEXT: paddd %xmm4, %xmm2 ; X64-NEXT: cvttps2dq %xmm2, %xmm2 ; X64-NEXT: pslld $16, %xmm2 ; X64-NEXT: psrad $16, %xmm2 -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; X64-NEXT: pslld $23, %xmm4 -; X64-NEXT: paddd %xmm3, %xmm4 -; X64-NEXT: cvttps2dq %xmm4, %xmm3 -; X64-NEXT: pslld $16, %xmm3 -; X64-NEXT: psrad $16, %xmm3 -; X64-NEXT: packssdw %xmm2, %xmm3 -; X64-NEXT: pmullw %xmm0, %xmm3 +; X64-NEXT: movdqa %xmm1, %xmm5 +; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; X64-NEXT: pslld $23, %xmm5 +; X64-NEXT: paddd %xmm4, %xmm5 +; X64-NEXT: cvttps2dq %xmm5, %xmm4 +; X64-NEXT: pslld $16, %xmm4 +; X64-NEXT: psrad $16, %xmm4 +; X64-NEXT: packssdw %xmm2, %xmm4 +; X64-NEXT: pmullw %xmm0, %xmm4 ; X64-NEXT: psllw $12, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm2 ; X64-NEXT: psraw $15, %xmm2 -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: psraw $8, %xmm4 -; X64-NEXT: pand %xmm2, %xmm4 -; X64-NEXT: pandn %xmm3, %xmm2 -; X64-NEXT: por %xmm4, %xmm2 -; X64-NEXT: paddw %xmm1, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: psraw $15, %xmm4 ; X64-NEXT: movdqa %xmm4, %xmm5 -; X64-NEXT: pandn %xmm2, %xmm5 -; X64-NEXT: psraw $4, %xmm2 -; X64-NEXT: pand %xmm4, %xmm2 +; X64-NEXT: psraw $8, %xmm5 +; X64-NEXT: pand %xmm2, %xmm5 +; X64-NEXT: pandn %xmm4, %xmm2 ; X64-NEXT: por %xmm5, %xmm2 ; X64-NEXT: paddw %xmm1, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: psraw $15, %xmm4 -; X64-NEXT: movdqa %xmm4, %xmm5 -; X64-NEXT: pandn %xmm2, %xmm5 +; X64-NEXT: movdqa %xmm1, %xmm5 +; X64-NEXT: psraw $15, %xmm5 +; X64-NEXT: movdqa %xmm5, %xmm6 +; X64-NEXT: pandn %xmm2, %xmm6 +; X64-NEXT: psraw $4, %xmm2 +; X64-NEXT: pand %xmm5, %xmm2 +; X64-NEXT: por %xmm6, %xmm2 +; X64-NEXT: paddw %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm5 +; 
X64-NEXT: psraw $15, %xmm5 +; X64-NEXT: movdqa %xmm5, %xmm6 +; X64-NEXT: pandn %xmm2, %xmm6 ; X64-NEXT: psraw $2, %xmm2 -; X64-NEXT: pand %xmm4, %xmm2 -; X64-NEXT: por %xmm5, %xmm2 +; X64-NEXT: pand %xmm5, %xmm2 +; X64-NEXT: por %xmm6, %xmm2 ; X64-NEXT: paddw %xmm1, %xmm1 ; X64-NEXT: psraw $15, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: pandn %xmm2, %xmm4 +; X64-NEXT: movdqa %xmm1, %xmm5 +; X64-NEXT: pandn %xmm2, %xmm5 ; X64-NEXT: psraw $1, %xmm2 ; X64-NEXT: pand %xmm1, %xmm2 -; X64-NEXT: por %xmm4, %xmm2 +; X64-NEXT: por %xmm5, %xmm2 ; X64-NEXT: pcmpeqw %xmm0, %xmm2 -; X64-NEXT: pand %xmm2, %xmm3 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpgtw %xmm0, %xmm1 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: pand %xmm2, %xmm4 +; X64-NEXT: pcmpgtw %xmm0, %xmm3 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: por %xmm3, %xmm0 ; X64-NEXT: pandn %xmm0, %xmm2 -; X64-NEXT: por %xmm3, %xmm2 +; X64-NEXT: por %xmm4, %xmm2 ; X64-NEXT: movdqa %xmm2, %xmm0 ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 88df3c175ec9c..8b3d249a82b61 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1264,24 +1264,28 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm0, 
%xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1475,55 +1479,63 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: psubq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; 
SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] ; SSE41-NEXT: pand %xmm7, %xmm8 -; SSE41-NEXT: por %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm2 ; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm6, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: psubq %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm3 +; 
SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: pxor %xmm8, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1844,103 +1856,119 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; 
SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] ; SSE41-NEXT: pand %xmm11, %xmm12 -; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm12, %xmm4 ; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm9 -; SSE41-NEXT: por %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm12, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = 
xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm13 -; SSE41-NEXT: por %xmm0, %xmm13 -; SSE41-NEXT: pxor %xmm10, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm13, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: pxor %xmm12, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd 
%xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm7 ; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: 
pcmpgtd %xmm10, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm7, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll index 953a0d65c5386..eb44e10b188ac 100644 --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -65,7 +65,7 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) { ; ; CHECK-X64-LABEL: fail: ; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: pslld $8, %xmm0 +; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: xorb $1, %al diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll index 6a52acfe2fb30..a014802d01649 100644 --- a/llvm/test/CodeGen/X86/ucmp.ll +++ b/llvm/test/CodeGen/X86/ucmp.ll @@ -1885,21 +1885,18 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: subq $88, %rsp +; SSE2-NEXT: subq $96, %rsp ; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) 
# 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: andl $127, %r8d -; SSE2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: andl $127, %edx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1914,7 +1911,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax -; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1926,6 +1923,9 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax +; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax @@ -1957,277 +1957,284 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; 
SSE2-NEXT: andl $127, %ecx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: andl $127, %ebx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: andl $127, %edx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: andl $127, %r10d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: andl $127, %r14d -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE2-NEXT: andl $127, %ebp +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE2-NEXT: andl $127, %r15d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE2-NEXT: andl $127, %r12d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; SSE2-NEXT: andl $127, %r13d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE2-NEXT: andl $127, %ebp +; SSE2-NEXT: andl $127, %edx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE2-NEXT: andl $127, %r11d -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE2-NEXT: andl $127, %r15d -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE2-NEXT: cmpq %rax, %r12 -; SSE2-NEXT: movq %r15, %r8 -; SSE2-NEXT: sbbq %r11, %r8 -; SSE2-NEXT: setb %r8b -; SSE2-NEXT: cmpq %r12, %rax -; SSE2-NEXT: sbbq %r15, %r11 -; SSE2-NEXT: sbbb $0, %r8b -; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %r13, %r11 -; SSE2-NEXT: sbbq %rbp, %r11 -; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %r13, %rbp -; SSE2-NEXT: sbbb $0, %r11b -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %r14, %r11 -; SSE2-NEXT: sbbq %r10, %r11 -; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %r14, %r10 -; SSE2-NEXT: sbbb $0, %r11b -; SSE2-NEXT: movb %r11b, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movq %r8, %rcx +; SSE2-NEXT: andl $127, %ecx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %rdx, %r10 -; SSE2-NEXT: sbbq %rbx, %r10 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: cmpq %rsi, %r8 +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: sbbq %rcx, %r10 ; SSE2-NEXT: setb %r10b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %rdx, %rbx +; SSE2-NEXT: cmpq %r8, %rsi +; SSE2-NEXT: sbbq %rax, %rcx +; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: sbbb $0, %r10b ; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: cmpq %rax, %rdx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %r8 -; SSE2-NEXT: sbbq %rcx, %r8 -; SSE2-NEXT: setb %r8b -; SSE2-NEXT: cmpq %rdx, %rax -; SSE2-NEXT: sbbq %r10, %rcx -; SSE2-NEXT: sbbb $0, %r8b -; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: cmpq %r10, %rax +; SSE2-NEXT: movq %r11, %rcx +; SSE2-NEXT: sbbq %rdx, %rcx +; SSE2-NEXT: setb %cl +; SSE2-NEXT: cmpq %rax, %r10 +; SSE2-NEXT: sbbq %r11, %rdx +; SSE2-NEXT: sbbb $0, %cl +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: cmpq %r9, %rax +; SSE2-NEXT: movq %rbp, %rcx +; SSE2-NEXT: sbbq %r13, %rcx +; SSE2-NEXT: setb %dil +; SSE2-NEXT: cmpq %rax, %r9 +; SSE2-NEXT: sbbq %rbp, %r13 +; SSE2-NEXT: sbbb $0, %dil ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE2-NEXT: sbbq %r8, %rdx -; SSE2-NEXT: setb %dl +; 
SSE2-NEXT: movq %r12, %r9 +; SSE2-NEXT: sbbq %r15, %r9 +; SSE2-NEXT: setb %r11b ; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %r10, %r8 -; SSE2-NEXT: sbbb $0, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: sbbq %r12, %r15 +; SSE2-NEXT: sbbb $0, %r11b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE2-NEXT: sbbq %r8, %rdx -; SSE2-NEXT: setb %dl +; SSE2-NEXT: movq %r14, %r9 +; SSE2-NEXT: sbbq %rbx, %r9 +; SSE2-NEXT: setb %r9b ; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %r10, %r8 -; SSE2-NEXT: sbbb $0, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: sbbq %r14, %rbx +; SSE2-NEXT: sbbb $0, %r9b +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE2-NEXT: movq %r11, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: sbbq %r10, %rdx -; SSE2-NEXT: setb %r8b +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: movq %rsi, %r9 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE2-NEXT: sbbq %rdx, %r9 +; SSE2-NEXT: setb %r9b ; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %r11, %r10 -; SSE2-NEXT: sbbb $0, %r8b +; SSE2-NEXT: sbbq %rsi, %rdx +; SSE2-NEXT: sbbb $0, %r9b +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE2-NEXT: movq %rbx, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; 
SSE2-NEXT: sbbq %r10, %rdx -; SSE2-NEXT: setb %r11b +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE2-NEXT: movq %r8, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: sbbq %rsi, %rdx +; SSE2-NEXT: setb %dl ; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %rbx, %r10 -; SSE2-NEXT: sbbb $0, %r11b +; SSE2-NEXT: sbbq %r8, %rsi +; SSE2-NEXT: sbbb $0, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE2-NEXT: movq %rbx, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: sbbq %r10, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE2-NEXT: movq %r8, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: sbbq %rsi, %rdx ; SSE2-NEXT: setb %dl ; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %rbx, %r10 +; SSE2-NEXT: sbbq %r8, %rsi ; SSE2-NEXT: sbbb $0, %dl -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rcx, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE2-NEXT: movq %r8, %rax +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: sbbq %rsi, %rax +; SSE2-NEXT: setb %r9b +; SSE2-NEXT: cmpq %rdx, %rcx +; SSE2-NEXT: sbbq %r8, %rsi +; SSE2-NEXT: sbbb $0, %r9b +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: cmpq %rdx, %rsi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE2-NEXT: movq %r8, %rcx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: sbbq %rax, %rcx +; SSE2-NEXT: setb %cl +; SSE2-NEXT: cmpq 
%rsi, %rdx +; SSE2-NEXT: sbbq %r8, %rax +; SSE2-NEXT: sbbb $0, %cl +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: cmpq %rsi, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE2-NEXT: movq %r10, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: sbbq %rax, %rdx +; SSE2-NEXT: setb %dl +; SSE2-NEXT: cmpq %r8, %rsi +; SSE2-NEXT: sbbq %r10, %rax +; SSE2-NEXT: sbbb $0, %dl +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: cmpq %r8, %r10 +; SSE2-NEXT: movq (%rsp), %rbx # 8-byte Reload +; SSE2-NEXT: movq %rbx, %rsi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: sbbq %rax, %rsi +; SSE2-NEXT: setb %sil +; SSE2-NEXT: cmpq %r10, %r8 +; SSE2-NEXT: sbbq %rbx, %rax +; SSE2-NEXT: sbbb $0, %sil +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE2-NEXT: cmpq %r10, %rbx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: movq %r14, %r10 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE2-NEXT: sbbq %rbx, %r10 -; SSE2-NEXT: setb %r10b -; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %r14, %rbx -; SSE2-NEXT: sbbb $0, %r10b -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq %r14, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: sbbq %rax, %r8 +; SSE2-NEXT: setb %r8b +; SSE2-NEXT: cmpq %rbx, %r10 +; SSE2-NEXT: sbbq %r14, %rax +; SSE2-NEXT: sbbb $0, %r8b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; SSE2-NEXT: cmpq %rax, %rbx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE2-NEXT: cmpq %rbx, %r14 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: movq %r15, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %rcx -; SSE2-NEXT: setb %cl -; SSE2-NEXT: cmpq %rbx, %rax -; SSE2-NEXT: sbbq %r15, %r14 -; 
SSE2-NEXT: sbbb $0, %cl -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq %r15, %r10 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: sbbq %rax, %r10 +; SSE2-NEXT: setb %r10b +; SSE2-NEXT: cmpq %r14, %rbx +; SSE2-NEXT: sbbq %r15, %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE2-NEXT: cmpq %rax, %r14 -; SSE2-NEXT: movq (%rsp), %r12 # 8-byte Reload +; SSE2-NEXT: sbbb $0, %r10b +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE2-NEXT: cmpq %r14, %r15 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; SSE2-NEXT: movq %r12, %rbx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: sbbq %r15, %rbx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: sbbq %rax, %rbx ; SSE2-NEXT: setb %bl -; SSE2-NEXT: cmpq %r14, %rax -; SSE2-NEXT: sbbq %r12, %r15 +; SSE2-NEXT: cmpq %r15, %r14 +; SSE2-NEXT: sbbq %r12, %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; SSE2-NEXT: sbbb $0, %bl -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: cmpq %r9, %rax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: movq %r12, %r14 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: sbbq %r15, %r14 +; SSE2-NEXT: cmpq %r14, %r15 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; SSE2-NEXT: movq %r13, %r12 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: sbbq %rax, %r12 ; SSE2-NEXT: setb %bpl -; SSE2-NEXT: cmpq %rax, %r9 -; SSE2-NEXT: sbbq %r12, %r15 -; SSE2-NEXT: sbbb $0, %bpl -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: cmpq %rsi, %rax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: movq %r15, %r9 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %r9 -; SSE2-NEXT: setb %r9b -; SSE2-NEXT: cmpq %rax, %rsi -; SSE2-NEXT: sbbq %r15, %r14 -; SSE2-NEXT: movq 
%rdi, %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: sbbb $0, %r9b -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: cmpq %r15, %rsi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: movq %r12, %rdi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %rdi -; SSE2-NEXT: setb %dil -; SSE2-NEXT: cmpq %rsi, %r15 -; SSE2-NEXT: sbbq %r12, %r14 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: cmpq %r15, %r14 +; SSE2-NEXT: sbbq %r13, %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE2-NEXT: sbbb $0, %dil -; SSE2-NEXT: cmpq %rsi, %r14 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE2-NEXT: sbbb $0, %bpl +; SSE2-NEXT: cmpq %r14, %r15 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: movq %rax, %r12 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE2-NEXT: movq %r13, %r15 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: sbbq %r12, %r15 -; SSE2-NEXT: setb %r15b -; SSE2-NEXT: cmpq %r14, %rsi ; SSE2-NEXT: sbbq %r13, %r12 -; SSE2-NEXT: sbbb $0, %r15b -; SSE2-NEXT: movzbl %r15b, %esi -; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: movb %sil, 4(%rax) -; SSE2-NEXT: movzbl %dil, %esi -; SSE2-NEXT: movzbl %r9b, %edi -; SSE2-NEXT: andl $3, %esi +; SSE2-NEXT: setb %r12b +; SSE2-NEXT: cmpq %r15, %r14 +; SSE2-NEXT: sbbq %rax, %r13 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movzbl %dil, %r14d +; SSE2-NEXT: movd %r13d, %xmm0 +; SSE2-NEXT: movzbl %r11b, %edi +; SSE2-NEXT: sbbb $0, %r12b +; SSE2-NEXT: movzbl %r12b, %r11d +; SSE2-NEXT: pinsrb $1, %r15d, %xmm0 +; SSE2-NEXT: pinsrb $2, %r14d, %xmm0 +; SSE2-NEXT: pinsrb $3, %edi, %xmm0 +; SSE2-NEXT: andl $3, %r11d +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE2-NEXT: movb %r11b, 4(%r12) +; SSE2-NEXT: movd 
%xmm0, %r11d +; SSE2-NEXT: andl $3, %r11d +; SSE2-NEXT: andl $3, %r15d +; SSE2-NEXT: leal (%r11,%r15,4), %r11d +; SSE2-NEXT: andl $3, %r14d +; SSE2-NEXT: shll $4, %r14d +; SSE2-NEXT: orl %r11d, %r14d ; SSE2-NEXT: andl $3, %edi -; SSE2-NEXT: leaq (%rdi,%rsi,4), %rsi -; SSE2-NEXT: movzbl %bpl, %edi +; SSE2-NEXT: shll $6, %edi +; SSE2-NEXT: orl %r14d, %edi +; SSE2-NEXT: movzbl %bpl, %r11d +; SSE2-NEXT: andl $3, %r11d +; SSE2-NEXT: shll $8, %r11d +; SSE2-NEXT: orl %edi, %r11d +; SSE2-NEXT: movzbl %bl, %edi ; SSE2-NEXT: andl $3, %edi -; SSE2-NEXT: shll $4, %edi -; SSE2-NEXT: orq %rsi, %rdi -; SSE2-NEXT: movzbl %bl, %r9d -; SSE2-NEXT: andl $3, %r9d -; SSE2-NEXT: shll $6, %r9d -; SSE2-NEXT: orq %rdi, %r9 -; SSE2-NEXT: movzbl %cl, %esi +; SSE2-NEXT: shll $10, %edi +; SSE2-NEXT: orl %r11d, %edi +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movzbl %r10b, %r10d +; SSE2-NEXT: andl $3, %r10d +; SSE2-NEXT: shll $12, %r10d +; SSE2-NEXT: andl $3, %r8d +; SSE2-NEXT: shll $14, %r8d +; SSE2-NEXT: orl %r10d, %r8d +; SSE2-NEXT: movzbl %sil, %esi ; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shll $8, %esi -; SSE2-NEXT: orq %r9, %rsi -; SSE2-NEXT: movzbl %dl, %ecx -; SSE2-NEXT: movzbl %r10b, %edx +; SSE2-NEXT: shll $16, %esi +; SSE2-NEXT: orl %r8d, %esi +; SSE2-NEXT: movzbl %dl, %edx ; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $10, %edx -; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shll $12, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: movzbl %r11b, %edx -; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $14, %edx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: movzbl %r8b, %ecx +; SSE2-NEXT: shll $18, %edx +; SSE2-NEXT: orl %esi, %edx +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shll $16, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shll $18, %esi -; SSE2-NEXT: andl 
$3, %edx -; SSE2-NEXT: shll $20, %edx -; SSE2-NEXT: orq %rsi, %rdx -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shll $22, %esi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: shll $20, %ecx +; SSE2-NEXT: orl %edx, %ecx +; SSE2-NEXT: movzbl %r9b, %edx ; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $24, %edx -; SSE2-NEXT: orq %rsi, %rdx -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shlq $26, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: orq %rcx, %rsi +; SSE2-NEXT: shll $22, %edx +; SSE2-NEXT: orl %ecx, %edx +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %eax +; SSE2-NEXT: shll $24, %eax +; SSE2-NEXT: orl %edx, %eax +; SSE2-NEXT: orl %edi, %eax ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload ; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shlq $28, %rdx +; SSE2-NEXT: shlq $26, %rdx ; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shlq $30, %rcx +; SSE2-NEXT: shlq $28, %rcx ; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: movl %ecx, (%rax) -; SSE2-NEXT: addq $88, %rsp +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %edx +; SSE2-NEXT: shlq $30, %rdx +; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %r12, %rax +; SSE2-NEXT: movl %edx, (%r12) +; SSE2-NEXT: addq $96, %rsp ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll index 271d11edff9a7..6d255f6bcf303 100644 --- a/llvm/test/CodeGen/X86/udiv-exact.ll +++ b/llvm/test/CodeGen/X86/udiv-exact.ll @@ -83,11 +83,12 @@ define <4 x i32> @test5(<4 x i32> %x) { ; X86-NEXT: movdqa 
%xmm0, %xmm1 ; X86-NEXT: psrld $3, %xmm1 ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3264175145,3264175145] +; X86-NEXT: movaps %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -108,11 +109,12 @@ define <4 x i32> @test6(<4 x i32> %x) { ; X86-NEXT: psrld $3, %xmm1 ; X86-NEXT: psrld $1, %xmm0 ; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997] -; X86-NEXT: pmuludq %xmm0, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3303820997,3303820997] +; X86-NEXT: movapd %xmm0, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqa %xmm1, %xmm0 @@ -130,11 +132,12 @@ define <4 x i32> @test6(<4 x i32> %x) { define <4 x i32> @test7(<4 x i32> %x) { ; X86-LABEL: test7: ; X86: # %bb.0: -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,1749801491,1749801491] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,2,2,3] +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl ; @@ -152,11 +155,12 @@ define <4 x i32> @test8(<4 x i32> %x) { ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrld $3, %xmm1 ; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2863311531,2863311531] +; X86-NEXT: movapd %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 97cc1f8a15694..5fcb80549fcc8 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -137,14 +137,12 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE2-NEXT: movd %edx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,1463,819,u] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7 -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: 
movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] ; SSE2-NEXT: movdqa %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 838086e366fbf..11808ea1def8e 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -11,17 +11,17 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even: @@ -160,18 +160,22 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; 
CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq @@ -238,18 +242,22 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq @@ -318,17 +326,17 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq: @@ -391,17 +399,17 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: 
pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne: @@ -467,9 +475,11 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -483,7 +493,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: 
pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -537,18 +547,22 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq @@ -617,9 +631,13 @@ 
define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -823,17 +841,17 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_one: @@ -899,9 +917,11 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -915,7 +935,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2] +; CHECK-SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [1,1,2,1] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -969,18 +989,22 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq @@ -1049,9 +1073,13 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -1129,9 +1157,11 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -1145,7 +1175,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1199,9 +1229,13 @@ 
define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -1278,9 +1312,13 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -1396,9 +1434,13 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -1475,17 +1517,17 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} 
xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one: @@ -1552,17 +1594,17 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: 
test_urem_odd_poweroftwo_and_one: @@ -1627,9 +1669,13 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -1706,17 +1752,17 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; 
CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one: @@ -1781,9 +1827,11 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -1797,7 +1845,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1849,9 +1897,11 @@ define <4 x i32> 
@test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -1865,7 +1915,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,1] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index ebb5e135eacd0..ed523e31b0569 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -222,22 +222,23 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-LABEL: vec_v8i16: ; X64: # %bb.0: -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; X64-NEXT: pslld $23, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X64-NEXT: paddd 
%xmm3, %xmm2 -; X64-NEXT: cvttps2dq %xmm2, %xmm4 -; X64-NEXT: pslld $16, %xmm4 -; X64-NEXT: psrad $16, %xmm4 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; X64-NEXT: pslld $23, %xmm2 -; X64-NEXT: paddd %xmm3, %xmm2 -; X64-NEXT: cvttps2dq %xmm2, %xmm2 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X64-NEXT: pslld $23, %xmm3 +; X64-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X64-NEXT: paddd %xmm4, %xmm3 +; X64-NEXT: cvttps2dq %xmm3, %xmm3 +; X64-NEXT: pslld $16, %xmm3 +; X64-NEXT: psrad $16, %xmm3 +; X64-NEXT: movdqa %xmm1, %xmm5 +; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; X64-NEXT: pslld $23, %xmm5 +; X64-NEXT: paddd %xmm4, %xmm5 +; X64-NEXT: cvttps2dq %xmm5, %xmm2 ; X64-NEXT: pslld $16, %xmm2 ; X64-NEXT: psrad $16, %xmm2 -; X64-NEXT: packssdw %xmm4, %xmm2 +; X64-NEXT: packssdw %xmm3, %xmm2 ; X64-NEXT: pmullw %xmm0, %xmm2 ; X64-NEXT: psllw $12, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll index a336d0a01fa7b..9dcc60491352d 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -1084,54 +1084,60 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; ; AVX1-64-LABEL: uitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-64-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-64-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-64-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 
-; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-64-NEXT: vmovq %xmm1, %rax +; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-64-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-64-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; AVX1-64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-64-NEXT: vpsrlq $1, %xmm1, %xmm4 +; AVX1-64-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-64-NEXT: vorpd %ymm4, %ymm3, %ymm3 +; AVX1-64-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0 +; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-64-NEXT: vmovq %xmm0, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; AVX1-64-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-64-NEXT: vblendvpd %xmm1, %xmm3, %xmm1, %xmm1 +; AVX1-64-NEXT: vmovq %xmm1, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-64-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX1-64-NEXT: vaddps %xmm0, %xmm0, %xmm1 +; AVX1-64-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; ; AVX2-64-LABEL: uitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-64-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX2-64-NEXT: 
vpsrlq $1, %ymm0, %ymm2 -; AVX2-64-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-64-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-64-NEXT: vmovq %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-64-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 +; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm2 +; AVX2-64-NEXT: vpsrlq $1, %ymm0, %ymm3 +; AVX2-64-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-64-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; AVX2-64-NEXT: vmovq %xmm0, %rax +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-64-NEXT: vmovq %xmm1, %rax +; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-64-NEXT: vmovq %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-64-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX2-64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX2-64-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX2-64-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index af841cf38b24a..e08fd5e06e8ca 100644 --- 
a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2127,13 +2127,18 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper @@ -2160,7 +2165,10 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2486,54 +2494,60 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; ; AVX1-LABEL: uitofp_4i64_to_4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vblendvps %xmm2, 
%xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_4i64_to_4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4227,54 +4241,61 @@ define <4 x 
float> @uitofp_load_4i64_to_4f32(ptr%a) { ; AVX1-LABEL: uitofp_load_4i64_to_4f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; 
AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_load_4i64_to_4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; AVX2-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm2[0,1,2],xmm0[0] +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4662,13 +4683,14 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm3 -; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 -; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpextrq $1, %xmm4, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[2,3] +; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] @@ -4676,6 +4698,9 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0] ; AVX1-NEXT: vaddps %xmm3, %xmm3, %xmm4 +; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm3, %xmm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm2 @@ -4684,13 +4709,14 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm2 -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 -; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm0, %xmm3 +; 
AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3] +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] @@ -4698,6 +4724,8 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] ; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -4707,45 +4735,50 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm6 +; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vblendvpd %ymm1, %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 +; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 +; 
AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[0] +; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm5 +; AVX2-NEXT: vblendvps %xmm3, %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm3 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm4 ; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm5 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0],xmm4[3] -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0] -; AVX2-NEXT: vaddps %xmm3, %xmm3, %xmm4 -; AVX2-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm1, %xmm4, %xmm3, %xmm1 -; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 +; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] -; AVX2-NEXT: vaddps %xmm2, %xmm2, %xmm3 -; AVX2-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm3 +; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_minmax_sint.ll b/llvm/test/CodeGen/X86/vec_minmax_sint.ll index 853e29b8acfcd..c8d988cb011ae 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_sint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_sint.ll @@ -34,15 +34,17 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: max_gt_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por 
%xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 @@ -125,20 +127,24 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -415,15 +421,17 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: max_ge_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = 
[2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 @@ -506,20 +514,24 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -796,14 +808,17 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: min_lt_v2i64: ; SSE41: # %bb.0: ; 
SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 @@ -886,20 +901,24 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm2, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd 
%xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -1177,14 +1196,17 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: min_le_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 @@ -1267,20 +1289,24 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm2, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; 
SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll index 9b4da3f9b817f..c9bfff4b7dfd7 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -34,15 +34,17 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: max_gt_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd 
%xmm1, %xmm0 @@ -136,20 +138,24 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -442,15 +448,17 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: max_ge_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm3 
+; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 @@ -544,20 +552,24 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -850,14 +862,17 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: min_lt_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} 
xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 @@ -950,20 +965,24 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm2, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -1260,14 +1279,17 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: min_le_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 @@ -1360,20 +1382,24 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-NEXT: movdqa %xmm2, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; 
SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index a54ff67f74755..0567cb2ac74b6 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -425,69 +425,71 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE2-NEXT: movd %r9d, %xmm3 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm7, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 -; SSE2-NEXT: pand %xmm4, %xmm8 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: paddd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = 
xmm7[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE2-NEXT: psubd %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE2-NEXT: movdqa %xmm6, (%rcx) +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: psubd %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: pmuludq %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: psubd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pmuludq %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %xmm0, 16(%rcx) -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; 
SSE2-NEXT: movq %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movq %xmm1, 16(%rcx) +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: movq %xmm1, 16(%rdi) +; SSE2-NEXT: movdqa %xmm6, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v6i32: @@ -497,115 +499,117 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSSE3-NEXT: movd %r9d, %xmm3 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq 
%xmm7, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pand %xmm6, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 -; SSSE3-NEXT: pand %xmm4, %xmm8 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: paddd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSSE3-NEXT: psubd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: movdqa %xmm4, (%rcx) -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: paddd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSSE3-NEXT: psubd %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSSE3-NEXT: movdqa %xmm6, (%rcx) +; SSSE3-NEXT: 
psrad $31, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pand %xmm7, %xmm6 -; SSSE3-NEXT: paddd %xmm8, %xmm6 -; SSSE3-NEXT: pmuludq %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: psubd %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: paddd %xmm7, %xmm4 +; SSSE3-NEXT: pmuludq %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] +; SSSE3-NEXT: pmuludq %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: psubd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSSE3-NEXT: pmuludq %xmm7, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %xmm0, 16(%rcx) -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: movq %xmm0, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movq %xmm1, 16(%rcx) +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: movq %xmm1, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm6, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: 
smulo_v6i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: movd %esi, %xmm2 -; SSE41-NEXT: pinsrd $1, %edx, %xmm2 -; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmuldq %xmm2, %xmm0 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE41-NEXT: movd %r9d, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pmuldq %xmm3, %xmm4 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm3 +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 ; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx -; SSE41-NEXT: pinsrd $1, %edx, %xmm5 -; SSE41-NEXT: pmulld %xmm3, %xmm5 +; SSE41-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, %edx, %xmm4 +; SSE41-NEXT: movd %r9d, %xmm2 +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SSE41-NEXT: pinsrd $1, %esi, %xmm2 +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movd %ecx, %xmm3 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pmuldq %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE41-NEXT: movd %edx, %xmm6 -; SSE41-NEXT: pmuldq %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] -; SSE41-NEXT: movq %xmm5, 16(%rsi) -; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: movd %esi, %xmm3 +; SSE41-NEXT: pmuldq %xmm6, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = 
xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7] +; SSE41-NEXT: pmulld %xmm4, %xmm2 +; SSE41-NEXT: movq %xmm2, 16(%rcx) +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE41-NEXT: pmuldq %xmm4, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] -; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, (%rsi) +; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pmuldq %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, (%rcx) ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: movq %xmm5, 16(%rdi) +; SSE41-NEXT: movq %xmm2, 16(%rdi) ; SSE41-NEXT: movdqa %xmm1, (%rdi) ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 62db6d234d301..2b76f02b57e3c 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -366,9 +366,9 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movd 
{{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -376,37 +376,42 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd %esi, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; 
SSE2-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-NEXT: pxor %xmm7, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] +; SSE2-NEXT: pmuludq %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pmuludq %xmm8, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movq %xmm7, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movq %xmm4, 16(%rdi) +; SSE2-NEXT: movdqa %xmm2, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v6i32: @@ -416,9 +421,9 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: 
punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -426,83 +431,88 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd %esi, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSSE3-NEXT: pmuludq %xmm2, %xmm0 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 +; SSSE3-NEXT: pxor %xmm7, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] +; SSSE3-NEXT: pmuludq %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT: pxor %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSSE3-NEXT: pmuludq %xmm8, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %xmm0, 16(%rcx) ; SSSE3-NEXT: movdqa %xmm3, (%rcx) -; SSSE3-NEXT: movq %xmm7, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movq %xmm4, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm2, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v6i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edi -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, %edi, %xmm0 ; SSE41-NEXT: movd %r9d, %xmm1 -; SSE41-NEXT: movdqa 
%xmm1, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm1 -; SSE41-NEXT: pinsrd $1, %edi, %xmm2 ; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %r9d -; SSE41-NEXT: pinsrd $1, %r9d, %xmm0 -; SSE41-NEXT: pmulld %xmm2, %xmm0 +; SSE41-NEXT: pinsrd $1, %r9d, %xmm1 ; SSE41-NEXT: movd %esi, %xmm2 ; SSE41-NEXT: pinsrd $1, %edx, %xmm2 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 -; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm4 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE41-NEXT: pmuludq %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pmuludq %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] ; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: pxor %xmm6, %xmm3 -; SSE41-NEXT: movd %edi, %xmm7 -; SSE41-NEXT: movd %r9d, %xmm8 -; SSE41-NEXT: pmuludq %xmm7, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm1 -; SSE41-NEXT: pmulld %xmm2, %xmm4 -; SSE41-NEXT: movq %xmm0, 16(%rcx) -; SSE41-NEXT: movdqa %xmm4, (%rcx) -; SSE41-NEXT: 
movq %xmm1, 16(%rax) -; SSE41-NEXT: movdqa %xmm3, (%rax) +; SSE41-NEXT: pxor %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: pmuludq %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE41-NEXT: movd %edi, %xmm8 +; SSE41-NEXT: movd %r9d, %xmm9 +; SSE41-NEXT: pmuludq %xmm8, %xmm9 +; SSE41-NEXT: pblendw {{.*#+}} xmm9 = xmm7[0,1],xmm9[2,3],xmm7[4,5],xmm9[6,7] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: pmulld %xmm2, %xmm3 +; SSE41-NEXT: movq %xmm1, 16(%rcx) +; SSE41-NEXT: movdqa %xmm3, (%rcx) +; SSE41-NEXT: movq %xmm9, 16(%rax) +; SSE41-NEXT: movdqa %xmm4, (%rax) ; SSE41-NEXT: retq ; ; AVX1-LABEL: umulo_v6i32: diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll index bf027a7346deb..8995989ef4474 100644 --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -272,10 +272,12 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax @@ -317,21 +319,25 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) { ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm3 +; 
SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %ecx +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: cmpl $3, %ecx ; SSE2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll index 2df39d69dbb75..24fee35c73404 100644 --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -223,10 +223,12 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax @@ 
-266,21 +268,25 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) { ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %ecx +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: sbbq %rax, %rax diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 49062eaef3188..dfbf0c3c92347 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -7983,28 +7983,32 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; ; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; 
AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff 
--git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index fd0525e6d56a2..fb007c7ee8aaf 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -429,24 +429,25 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: pslld $23, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm6, %xmm5 +; SSE2-NEXT: cvttps2dq %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 @@ -464,18 +465,19 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm5, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm4, %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE41-NEXT: pmulld %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm2 ; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE41-NEXT: pslld $23, %xmm4 -; SSE41-NEXT: paddd %xmm5, %xmm4 -; SSE41-NEXT: cvttps2dq %xmm4, %xmm0 +; SSE41-NEXT: pslld $23, %xmm5 +; SSE41-NEXT: paddd %xmm4, %xmm5 +; SSE41-NEXT: cvttps2dq %xmm5, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 @@ -485,7 +487,8 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 @@ -616,24 +619,25 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm5 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm4, %xmm5 -; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm5, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X86-SSE2-NEXT: pslld $23, %xmm6 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm5, %xmm6 +; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm6, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm6, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = 
xmm6[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm7, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; X86-SSE2-NEXT: psrad $16, %xmm3 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: paddd %xmm4, %xmm2 +; X86-SSE2-NEXT: paddd %xmm5, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 @@ -653,47 +657,47 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; SSE2-LABEL: var_funnnel_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE2-NEXT: pslld $23, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm6 ; SSE2-NEXT: cvttps2dq %xmm6, %xmm6 ; SSE2-NEXT: pslld $16, 
%xmm6 ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $23, %xmm4 -; SSE2-NEXT: paddd %xmm3, %xmm4 -; SSE2-NEXT: cvttps2dq %xmm4, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-NEXT: pslld $23, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: cvttps2dq %xmm5, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: packssdw %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm7, %xmm4 -; SSE2-NEXT: psrlw $8, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm5 -; SSE2-NEXT: paddd %xmm3, %xmm5 -; SSE2-NEXT: cvttps2dq %xmm5, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm7, %xmm5 +; SSE2-NEXT: psrlw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE2-NEXT: pslld $23, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: cvttps2dq %xmm6, %xmm6 +; SSE2-NEXT: pslld $16, %xmm6 +; SSE2-NEXT: 
psrad $16, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 +; SSE2-NEXT: packssdw %xmm6, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: pmullw %xmm1, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -704,32 +708,32 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm7, %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE41-NEXT: pslld $23, %xmm3 -; SSE41-NEXT: 
paddd %xmm6, %xmm3 -; SSE41-NEXT: cvttps2dq %xmm3, %xmm3 -; SSE41-NEXT: packusdw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm3, %xmm7 -; SSE41-NEXT: psrlw $8, %xmm7 +; SSE41-NEXT: pslld $23, %xmm6 +; SSE41-NEXT: paddd %xmm7, %xmm6 +; SSE41-NEXT: cvttps2dq %xmm6, %xmm6 +; SSE41-NEXT: packusdw %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm6, %xmm8 +; SSE41-NEXT: psrlw $8, %xmm8 ; SSE41-NEXT: pslld $23, %xmm4 -; SSE41-NEXT: paddd %xmm6, %xmm4 +; SSE41-NEXT: paddd %xmm7, %xmm4 ; SSE41-NEXT: cvttps2dq %xmm4, %xmm2 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE41-NEXT: pslld $23, %xmm5 -; SSE41-NEXT: paddd %xmm6, %xmm5 +; SSE41-NEXT: paddd %xmm7, %xmm5 ; SSE41-NEXT: cvttps2dq %xmm5, %xmm3 ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pmullw %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm7, %xmm2 +; SSE41-NEXT: packuswb %xmm8, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -737,34 +741,34 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 +; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v16i8: @@ -916,47 +920,47 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pxor %xmm5, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; 
X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; X86-SSE2-NEXT: pslld $23, %xmm6 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm4, %xmm6 ; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6 ; X86-SSE2-NEXT: pslld $16, %xmm6 ; X86-SSE2-NEXT: psrad $16, %xmm6 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; X86-SSE2-NEXT: pslld $23, %xmm4 -; X86-SSE2-NEXT: paddd %xmm3, %xmm4 -; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm7 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; X86-SSE2-NEXT: pslld $23, %xmm5 +; X86-SSE2-NEXT: paddd %xmm4, %xmm5 +; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm7 ; X86-SSE2-NEXT: pslld $16, %xmm7 ; X86-SSE2-NEXT: psrad $16, %xmm7 ; X86-SSE2-NEXT: packssdw %xmm6, %xmm7 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; X86-SSE2-NEXT: pmullw %xmm7, %xmm4 -; X86-SSE2-NEXT: psrlw $8, %xmm4 -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm5 -; X86-SSE2-NEXT: paddd %xmm3, %xmm5 -; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5 -; X86-SSE2-NEXT: 
pslld $16, %xmm5 -; X86-SSE2-NEXT: psrad $16, %xmm5 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; X86-SSE2-NEXT: pmullw %xmm7, %xmm5 +; X86-SSE2-NEXT: psrlw $8, %xmm5 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; X86-SSE2-NEXT: pslld $23, %xmm6 +; X86-SSE2-NEXT: paddd %xmm4, %xmm6 +; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6 +; X86-SSE2-NEXT: pslld $16, %xmm6 +; X86-SSE2-NEXT: psrad $16, %xmm6 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: paddd %xmm3, %xmm2 +; X86-SSE2-NEXT: paddd %xmm4, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 -; X86-SSE2-NEXT: packssdw %xmm5, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm6, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE2-NEXT: pmullw %xmm1, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm2 -; X86-SSE2-NEXT: packuswb %xmm4, %xmm2 +; X86-SSE2-NEXT: packuswb %xmm5, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) @@ -1319,19 +1323,34 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % } define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { -; SSE-LABEL: 
splatvar_funnnel_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: psllw %xmm2, %xmm3 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: psllw %xmm2, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: psllw %xmm2, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psllw %xmm2, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_funnnel_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: psllw %xmm2, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: psllw %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_funnnel_v16i8: ; AVX: # %bb.0: @@ -1429,6 +1448,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index fdd0d68b89003..d046c6f04898f 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -320,32 +320,33 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm7, %xmm7 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 -; AVX1-NEXT: vpmulld %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-NEXT: vpslld $23, %xmm8, %xmm8 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216] +; 
AVX1-NEXT: vpaddd %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vcvttps2dq %xmm8, %xmm8 +; AVX1-NEXT: vpmulld %xmm8, %xmm5, %xmm5 ; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsrld $16, %xmm4, %xmm4 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -480,17 +481,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-LABEL: var_funnnel_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm3, %xmm7 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-NEXT: vpslld $23, %xmm4, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 @@ -498,48 +499,48 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpslld $23, %xmm9, %xmm9 -; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9 +; AVX1-NEXT: 
vpaddd %xmm4, %xmm9, %xmm9 ; AVX1-NEXT: vcvttps2dq %xmm9, %xmm9 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 -; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-NEXT: vpslld $23, %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpslld $23, %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: 
vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index b763b7bac2432..27f7204b4bdd4 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -306,21 +306,22 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pmulhuw %xmm1, %xmm2 ; SSE2-NEXT: pmullw %xmm1, %xmm0 @@ -330,15 +331,16 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE41-LABEL: var_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: pslld $23, %xmm3 +; SSE41-NEXT: paddd %xmm2, %xmm3 +; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmulhuw %xmm2, %xmm1 @@ -349,7 +351,8 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 @@ -453,17 +456,18 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; X86-SSE2-LABEL: var_funnnel_v8i16: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm4, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; X86-SSE2-NEXT: pslld $23, %xmm1 -; X86-SSE2-NEXT: paddd %xmm3, %xmm1 +; X86-SSE2-NEXT: paddd %xmm4, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; X86-SSE2-NEXT: pslld $16, %xmm1 ; X86-SSE2-NEXT: psrad $16, %xmm1 @@ -1056,18 +1060,32 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind } define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE-NEXT: psllw 
%xmm1, %xmm2 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: psllw %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_funnnel_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_funnnel_v16i8: ; AVX: # %bb.0: @@ -1168,9 +1186,10 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: psllw %xmm1, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 9e872cc6d74a9..3ed10627cd176 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -234,28 +234,29 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6 -; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 
$1, %ymm0, %xmm5 +; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7 +; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index a56b0a6351a3b..f10fec2638487 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -461,23 +461,24 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: pslld $23, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm5, %xmm4 +; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pslld $23, %xmm2 +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm1, %xmm2 +; SSE2-NEXT: packssdw %xmm4, %xmm1 ; SSE2-NEXT: paddw %xmm0, %xmm0 -; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; @@ -511,15 +512,16 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: pandn %xmm5, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm4, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm0, %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE41-NEXT: pslld $23, %xmm0 -; SSE41-NEXT: paddd %xmm4, %xmm0 -; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE41-NEXT: pslld $23, %xmm4 +; SSE41-NEXT: paddd %xmm0, %xmm4 +; SSE41-NEXT: cvttps2dq %xmm4, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 ; SSE41-NEXT: paddw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm0, %xmm3 @@ -546,7 +548,8 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x 
i16> %y, <8 x i16> %amt) ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 @@ -704,17 +707,18 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; X86-SSE2-NEXT: pand %xmm4, %xmm3 ; X86-SSE2-NEXT: por %xmm1, %xmm3 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; X86-SSE2-NEXT: pslld $23, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm5, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; X86-SSE2-NEXT: pslld $16, %xmm1 ; X86-SSE2-NEXT: psrad $16, %xmm1 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: paddd %xmm4, %xmm2 +; X86-SSE2-NEXT: paddd %xmm5, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 @@ -1447,6 +1451,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: 
splatvar_funnnel_v16i8: ; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 @@ -1588,6 +1593,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 0fa2c858ff000..ada7430a6d345 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -339,36 +339,37 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm6 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm4, %xmm7 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-NEXT: vpslld $23, %xmm4, %xmm8 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8 +; AVX1-NEXT: vcvttps2dq %xmm8, %xmm8 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero 
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 ; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpackusdw %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vpaddw %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm6 -; AVX1-NEXT: vpsllw $4, %xmm2, %xmm7 -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm7 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm8 -; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm8 +; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm9 +; AVX1-NEXT: vpblendvb %xmm6, %xmm9, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm6 -; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm6 -; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 9ce682306f18b..15b3e9c43413c 100644 
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -318,20 +318,21 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubw %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: psubw %xmm1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm2 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm1, %xmm2 @@ -344,18 +345,19 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE41-LABEL: var_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: psubw %xmm1, %xmm2 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE41-NEXT: pslld 
$23, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: psubw %xmm1, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE41-NEXT: pslld $23, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm2, %xmm3 +; SSE41-NEXT: cvttps2dq %xmm3, %xmm3 ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm1, %xmm0 @@ -367,7 +369,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 @@ -472,20 +474,21 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; ; X86-SSE2-LABEL: var_funnnel_v8i16: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: psubw %xmm1, %xmm2 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; 
X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; X86-SSE2-NEXT: pslld $23, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm4, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; X86-SSE2-NEXT: pslld $16, %xmm1 ; X86-SSE2-NEXT: psrad $16, %xmm1 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: paddd %xmm3, %xmm2 +; X86-SSE2-NEXT: paddd %xmm4, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 @@ -1101,9 +1104,10 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: psrlw %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm3, %xmm2 @@ -1245,9 +1249,10 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: movdqa %xmm0, 
%xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: psrlw %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; X86-SSE2-NEXT: pand %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 3d4f283260aa5..3e29915acd24d 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -249,7 +249,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 @@ -265,7 +265,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 89330122fa239..b1a6fc82bd12f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -347,22 +347,22 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6],ymm5[7] -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3],ymm7[4],ymm4[5,6,7] +; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm4[0,0,3,3] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5,6],ymm5[7] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,u,2,u,u,u,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7] ; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[1,u,u,u,6,u,u,u] -; AVX-NEXT: vbroadcastss 8(%rcx), %ymm6 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] +; AVX-NEXT: vbroadcastss 8(%rcx), %ymm5 +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5],ymm7[6],ymm0[7] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] ; AVX-NEXT: vbroadcastss 12(%rsi), %xmm2 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = 
xmm1[0,1,2],xmm7[3] ; AVX-NEXT: vmovaps %xmm1, 64(%r9) ; AVX-NEXT: vmovaps %ymm4, (%r9) ; AVX-NEXT: vmovaps %ymm0, 32(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 62e2aadd818c1..dca00c940516a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -9596,38 +9596,51 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i32_stride7_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX-NEXT: subq $3384, %rsp # imm = 0xD38 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rcx), %ymm5 -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%r8), %ymm4 +; AVX-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX-NEXT: vmovaps 224(%rsi), %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rax), %ymm3 +; AVX-NEXT: vmovaps 224(%rdx), %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 224(%rcx), %ymm6 +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 224(%r8), %ymm8 +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 224(%rax), %ymm7 +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm0[1],xmm1[1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[0,2] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 224(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm1 +; AVX-NEXT: vbroadcastss 228(%r8), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX-NEXT: vbroadcastss 228(%r9), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] -; AVX-NEXT: vmovaps %ymm1, %ymm5 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = 
ymm3[2,3],ymm0[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rax), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%r9), %xmm3 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%r8), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%rdi), %xmm6 ; AVX-NEXT: vmovaps (%rsi), %xmm5 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm6[0] @@ -9641,8 +9654,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovaps (%r9), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%r8), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vmovaps (%rax), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] @@ -9658,23 +9677,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX-NEXT: vmovaps (%rsi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX-NEXT: vmovaps (%rdi), %ymm7 +; AVX-NEXT: vmovaps (%rsi), %ymm6 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm7[1,1],ymm6[5,5],ymm7[5,5] +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps (%rdx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rcx), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX-NEXT: vmovaps (%rdx), %ymm4 +; AVX-NEXT: vmovaps (%rcx), %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps (%r8), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%r9), %ymm8 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] +; AVX-NEXT: vmovaps (%r8), %ymm5 +; AVX-NEXT: vmovaps (%r9), %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -9682,21 +9701,33 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm9[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,1] -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 16(%rax), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm6[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX-NEXT: vmovaps 32(%rdx), %xmm12 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 32(%r9), %xmm3 -; AVX-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%r8), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -9710,32 +9741,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm6[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm10[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX-NEXT: vmovaps 
32(%rsi), %ymm4 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rcx), %ymm10 -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm10[1,1],ymm1[5,5],ymm10[5,5] -; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps 32(%r8), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%r9), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] +; AVX-NEXT: vmovaps 32(%r8), %ymm5 +; AVX-NEXT: vmovaps 32(%r9), %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -9743,17 +9773,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; 
AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm7[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 48(%rax), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm6[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 64(%rcx), %xmm9 -; AVX-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 64(%rcx), %xmm7 +; AVX-NEXT: vmovaps 64(%rdx), %xmm8 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%r9), %xmm3 @@ -9771,32 +9813,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm9[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX-NEXT: vmovaps 64(%rcx), %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps 64(%r8), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%r9), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] +; AVX-NEXT: vmovaps 64(%r8), %ymm5 +; AVX-NEXT: vmovaps 64(%r9), %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 64(%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -9804,24 +9845,36 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm5[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX-NEXT: 
vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 80(%rax), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 96(%rdi), %xmm6 +; AVX-NEXT: vmovaps 96(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm6[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 96(%rcx), %xmm7 -; AVX-NEXT: vmovaps 96(%rdx), %xmm9 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX-NEXT: vmovaps 96(%rdx), %xmm8 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 96(%r9), %xmm4 +; AVX-NEXT: vmovaps 96(%r9), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%r8), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 96(%r8), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX-NEXT: vunpcklps 
{{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps 96(%rax), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9829,351 +9882,267 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX-NEXT: vmovaps 96(%rsi), %ymm6 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm7[1,1],ymm6[5,5],ymm7[5,5] +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX-NEXT: vmovaps 96(%rcx), %ymm9 +; AVX-NEXT: vshufps {{.*#+}} ymm2 = 
ymm8[1,1],ymm9[1,1],ymm8[5,5],ymm9[5,5] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX-NEXT: vmovaps 96(%r8), %ymm5 +; AVX-NEXT: vmovaps 96(%r9), %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,1],ymm2[6,4],ymm5[6,5] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 96(%rax), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 96(%rdx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps 96(%r8), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%r9), %ymm14 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[2],ymm2[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] -; AVX-NEXT: vmovaps 96(%rax), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: 
vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %xmm5 -; AVX-NEXT: vmovaps 128(%rsi), %xmm4 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 128(%rcx), %xmm7 -; AVX-NEXT: vmovaps 128(%rdx), %xmm9 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm2[0,2],ymm4[5,5],ymm2[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX-NEXT: vmovaps 112(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 128(%rdi), %xmm7 +; AVX-NEXT: vmovaps 128(%rsi), %xmm6 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm7[0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,1] ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX-NEXT: vmovaps 128(%rcx), %xmm8 +; AVX-NEXT: vmovaps 128(%rdx), %xmm9 +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: 
vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 128(%r9), %xmm3 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 128(%r8), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 128(%rax), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 128(%r9), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 128(%r8), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX-NEXT: vmovaps 128(%rax), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm5[1,1] ; 
AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1],xmm2[0,2] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[1],xmm8[1],zero +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX-NEXT: vmovaps 128(%rsi), %ymm6 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm7[1,1],ymm6[5,5],ymm7[5,5] +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 128(%rdx), %ymm8 +; AVX-NEXT: vmovaps 128(%rcx), %ymm9 +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm9[1,1],ymm8[5,5],ymm9[5,5] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX-NEXT: vmovaps 128(%r8), %ymm5 +; AVX-NEXT: vmovaps 128(%r9), %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,1],ymm2[6,4],ymm5[6,5] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 128(%rax), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps 128(%r8), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%r9), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] -; AVX-NEXT: vmovaps 128(%rax), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX-NEXT: vshufps {{.*#+}} 
ymm2 = ymm4[1,1],ymm2[0,2],ymm4[5,5],ymm2[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX-NEXT: vmovaps 144(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX-NEXT: vmovaps 160(%rdi), %xmm7 ; AVX-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm11[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 160(%rcx), %xmm7 -; AVX-NEXT: vmovaps 160(%rdx), %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm7[0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,1] +; AVX-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX-NEXT: vmovaps 160(%rcx), %xmm8 +; AVX-NEXT: vmovaps 160(%rdx), %xmm10 +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovaps 160(%r9), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 160(%r8), %xmm5 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 160(%rax), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX-NEXT: vmovaps 160(%rax), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm6[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm5[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm3[1],xmm7[1],zero -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1],xmm2[0,2] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm3 = 
zero,xmm10[1],xmm8[1],zero +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 160(%rdx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rcx), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps 160(%r8), %ymm11 -; AVX-NEXT: vmovaps 160(%r9), %ymm13 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,1],ymm1[6,4],ymm11[6,5] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rax), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX-NEXT: vmovaps 160(%rsi), %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm10[1,1],ymm8[5,5],ymm10[5,5] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 160(%rdx), %ymm7 +; AVX-NEXT: vmovaps 160(%rcx), %ymm6 +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1],ymm6[1,1],ymm7[5,5],ymm6[5,5] +; AVX-NEXT: vmovups %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX-NEXT: vmovaps 160(%r8), %ymm5 +; AVX-NEXT: vmovaps 160(%r9), %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,1],ymm2[6,4],ymm5[6,5] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 160(%rax), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %xmm12 -; AVX-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm12[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,1] -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1],ymm2[0,2],ymm3[5,5],ymm2[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX-NEXT: vmovaps 176(%rax), %xmm4 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 192(%rdi), %xmm11 +; 
AVX-NEXT: vmovaps 192(%rsi), %xmm10 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm11[0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,1] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vmovaps 192(%rcx), %xmm7 -; AVX-NEXT: vmovaps 192(%rdx), %xmm6 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX-NEXT: vmovaps 192(%rdx), %xmm8 +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 192(%r9), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 192(%r9), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 192(%r8), %xmm5 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 192(%rax), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX-NEXT: vmovaps 192(%rax), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm3[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm5[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm7[1],zero -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1],xmm2[0,2] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm8[1],xmm7[1],zero +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX-NEXT: vmovaps 192(%rsi), %ymm13 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm2[1,1],ymm13[5,5],ymm2[5,5] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 192(%rdx), %ymm12 +; AVX-NEXT: vmovaps 192(%rcx), %ymm7 +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[1,1],ymm7[1,1],ymm12[5,5],ymm7[5,5] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6],ymm1[7] +; AVX-NEXT: vmovaps 192(%r8), %ymm8 +; AVX-NEXT: vmovaps 192(%r9), %ymm14 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm8[0],ymm14[2],ymm8[2] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,1],ymm15[6,4],ymm8[6,5] +; 
AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm15[2,3] +; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rsi), %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[1,1],ymm7[5,5],ymm0[5,5] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rcx), %ymm2 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] +; AVX-NEXT: vmovaps %ymm2, %ymm3 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] -; AVX-NEXT: vmovaps 192(%r8), %ymm1 -; AVX-NEXT: vmovaps 192(%r9), %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,1],ymm12[6,4],ymm1[6,5] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6],ymm0[7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = 
ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm4[0,2],ymm8[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 16(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 48(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 80(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm4[0,2],ymm14[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 112(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1],ymm4[0,2],ymm6[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 144(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm4[0,2],ymm13[5,5],ymm4[4,6] -; 
AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 176(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[6],ymm7[6],ymm12[7],ymm7[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm14[1],ymm8[3],ymm14[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm1[0,2],ymm14[5,5],ymm1[4,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX-NEXT: vmovaps 208(%rax), %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 208(%rax), %xmm8 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rsi), %xmm3 -; AVX-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm3[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; 
AVX-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX-NEXT: vmovaps 224(%rdx), %xmm12 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm12[1],xmm4[1],zero -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] -; AVX-NEXT: vbroadcastss 228(%r9), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] -; AVX-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm12[1] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm15[1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3],xmm14[3,3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -10183,9 +10152,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm12[3,3],ymm7[7,7],ymm12[7,7] ; AVX-NEXT: vperm2f128 
{{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm7[3,3],ymm11[7,7],ymm7[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,3],ymm13[3,3],ymm3[7,7],ymm13[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vbroadcastss 220(%r8), %ymm1 @@ -10195,12 +10164,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastsd 216(%rax), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm8[0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vbroadcastss 224(%r9), %ymm1 @@ -10208,13 +10177,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastss 224(%rax), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm3[3,3],ymm8[7,7],ymm3[7,7] +; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vbroadcastss 252(%r8), %ymm1 @@ -10224,51 +10193,39 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX-NEXT: vbroadcastss 236(%r8), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] -; AVX-NEXT: vbroadcastss 236(%r9), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] -; AVX-NEXT: vshufps {{.*#+}} 
ymm1 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX-NEXT: vbroadcastsd 240(%r8), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX-NEXT: vbroadcastss 240(%r9), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] -; AVX-NEXT: vbroadcastss 240(%rax), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[2,2,2,2] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} 
xmm1 = xmm3[0,1,2],xmm1[3] +; AVX-NEXT: vbroadcastsd 8(%rax), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm8[0],ymm15[1],ymm8[1],ymm15[4],ymm8[4],ymm15[5],ymm8[5] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm14[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm8[3,3],ymm15[7,7],ymm8[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10289,21 +10246,30 @@ define void 
@store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX-NEXT: vpermilps $170, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[2,2,2,2] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX-NEXT: vbroadcastsd 40(%rax), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,1],ymm0[0,2],ymm14[7,5],ymm0[4,6] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm15[3,3],ymm14[7,7],ymm15[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm4[3,3],ymm8[7,7],ymm4[7,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10324,21 +10290,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[2,2,2,2] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX-NEXT: vbroadcastsd 72(%rax), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,1],ymm0[0,2],ymm14[7,5],ymm0[4,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm15[3,3],ymm14[7,7],ymm15[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm4[3,3],ymm8[7,7],ymm4[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10359,20 +10334,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[2,2,2,2] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX-NEXT: vbroadcastsd 104(%rax), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,1],ymm0[0,2],ymm14[7,5],ymm0[4,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps 
{{.*#+}} ymm0 = ymm14[3,3],ymm15[3,3],ymm14[7,7],ymm15[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm4[3,3],ymm8[7,7],ymm4[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10393,260 +10378,213 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[2,2,2,2] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX-NEXT: vbroadcastsd 136(%rax), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX-NEXT: vbroadcastsd 136(%rax), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm15[3,3],ymm12[7,7],ymm15[7,7] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm15[0],ymm9[0],ymm15[2],ymm9[2] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,1],ymm0[0,2],ymm9[7,5],ymm0[4,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,3],ymm15[3,3],ymm9[7,7],ymm15[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm4[3,3],ymm8[7,7],ymm4[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3],ymm1[1,2],ymm3[6,7],ymm1[5,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm0[3,3],mem[3,3] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[2,2,2,2] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] -; AVX-NEXT: vbroadcastsd 168(%rax), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vshufps 
{{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX-NEXT: vbroadcastsd 168(%rax), %ymm15 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,1],ymm0[0,2],ymm6[7,5],ymm0[4,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3],ymm9[3,3],ymm0[7,7],ymm9[7,7] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] +; AVX-NEXT: vunpcklps {{.*#+}} ymm15 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[4],ymm4[4],ymm9[5],ymm4[5] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm2[3,3],xmm3[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,3],ymm1[3,3],ymm6[7,7],ymm1[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm9[3,3],ymm4[3,3],ymm9[7,7],ymm4[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX-NEXT: vshufps 
{{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: # ymm15 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm2[2,3],ymm15[1,2],ymm2[6,7],ymm15[5,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm3[1,2,3,4],ymm15[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm11[3,3],xmm10[3,3] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX-NEXT: vbroadcastsd 200(%rax), %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm9 +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6],ymm9[7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] +; AVX-NEXT: vbroadcastsd 200(%rax), %ymm11 +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4],ymm9[5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm4[0,2],ymm7[7,5],ymm4[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3],xmm15[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm4[0,2],ymm11[7,5],ymm4[4,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX-NEXT: vbroadcastss 236(%r8), %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7] +; AVX-NEXT: vbroadcastss 236(%r9), %xmm6 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,1],ymm11[1,1],ymm10[5,5],ymm11[5,5] +; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1],ymm12[1,1],ymm7[5,5],ymm12[5,5] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX-NEXT: vbroadcastsd 240(%r8), %ymm7 +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX-NEXT: vbroadcastss 240(%r9), %xmm7 +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] +; AVX-NEXT: vbroadcastss 240(%rax), %ymm7 +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4,5,6,7] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vmovaps %ymm6, 1696(%rax) +; AVX-NEXT: vmovaps %ymm4, 1664(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = 
xmm3[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = 
ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps %ymm1, 1504(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[4],ymm11[4],ymm2[5],ymm11[5] -; AVX-NEXT: vmovaps %ymm12, %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1],ymm12[0,2],ymm2[7,5],ymm12[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm2[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[4],ymm9[4],ymm0[5],ymm9[5] +; AVX-NEXT: vmovaps %ymm1, 1472(%rax) 
+; AVX-NEXT: vmovaps %ymm2, 1440(%rax) +; AVX-NEXT: vmovaps %ymm9, 1408(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm10[3,1],ymm15[0,2],ymm10[7,5],ymm15[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm13[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, 1376(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm13[0,2],ymm0[7,5],ymm13[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] -; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm7[3,3],xmm8[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps %ymm10, 1440(%rax) -; AVX-NEXT: vmovaps %ymm11, 1216(%rax) -; AVX-NEXT: vmovaps %ymm3, 992(%rax) -; AVX-NEXT: vmovaps %ymm1, 768(%rax) -; AVX-NEXT: vmovaps %ymm5, 544(%rax) -; AVX-NEXT: vmovaps %ymm6, 320(%rax) -; AVX-NEXT: vmovaps %ymm14, 96(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX-NEXT: vmovaps %ymm2, 1344(%rax) +; AVX-NEXT: vmovaps %ymm3, 1312(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, 1280(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, 1248(%rax) +; AVX-NEXT: vmovaps %ymm0, 1216(%rax) +; AVX-NEXT: vmovaps %ymm5, 1184(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX-NEXT: vmovaps %ymm0, 1152(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1312(%rax) +; AVX-NEXT: vmovaps %ymm0, 1120(%rax) +; AVX-NEXT: vmovaps %ymm8, 1088(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1280(%rax) +; AVX-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1184(%rax) +; AVX-NEXT: vmovaps %ymm0, 1024(%rax) +; AVX-NEXT: vmovaps %ymm14, 992(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1088(%rax) +; AVX-NEXT: vmovaps %ymm0, 960(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1056(%rax) +; AVX-NEXT: vmovaps %ymm0, 928(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 960(%rax) +; AVX-NEXT: vmovaps %ymm0, 896(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 864(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 832(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 736(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 640(%rax) -; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 608(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 512(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 416(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 384(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 288(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%rax) +; AVX-NEXT: vmovaps %ymm0, 800(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%rax) +; AVX-NEXT: vmovaps %ymm0, 768(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%rax) +; AVX-NEXT: vmovaps %ymm0, 736(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1696(%rax) +; AVX-NEXT: vmovaps %ymm0, 704(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1664(%rax) +; AVX-NEXT: vmovaps %ymm0, 672(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1472(%rax) +; AVX-NEXT: vmovaps %ymm0, 640(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1376(%rax) +; AVX-NEXT: vmovaps %ymm0, 608(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1344(%rax) +; AVX-NEXT: vmovaps %ymm0, 576(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1248(%rax) +; AVX-NEXT: vmovaps %ymm0, 544(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1152(%rax) +; AVX-NEXT: vmovaps %ymm0, 512(%rax) ; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1120(%rax) +; AVX-NEXT: vmovaps %ymm0, 480(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1024(%rax) +; AVX-NEXT: vmovaps %ymm0, 448(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 928(%rax) +; AVX-NEXT: vmovaps %ymm0, 416(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 896(%rax) +; AVX-NEXT: vmovaps %ymm0, 384(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 800(%rax) +; AVX-NEXT: vmovaps %ymm0, 352(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 704(%rax) +; AVX-NEXT: vmovaps %ymm0, 320(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 672(%rax) +; AVX-NEXT: vmovaps %ymm0, 288(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 576(%rax) +; AVX-NEXT: vmovaps %ymm0, 256(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 480(%rax) +; AVX-NEXT: vmovaps %ymm0, 224(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 448(%rax) +; AVX-NEXT: vmovaps %ymm0, 192(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 352(%rax) +; AVX-NEXT: vmovaps %ymm0, 160(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 256(%rax) +; AVX-NEXT: vmovaps %ymm0, 128(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%rax) +; AVX-NEXT: vmovaps %ymm0, 96(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%rax) +; AVX-NEXT: vmovaps 
%ymm0, 64(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 32(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10663,1031 +10601,969 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX-NEXT: addq $3384, %rsp # imm = 0xD38 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i32_stride7_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $2968, %rsp # imm = 0xB98 +; AVX2-NEXT: subq $2792, %rsp # imm = 0xAE8 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[1],zero +; AVX2-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm1 +; AVX2-NEXT: vbroadcastss 228(%r8), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 224(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rax), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps (%r8), %xmm13 -; AVX2-NEXT: vmovaps 32(%r8), %xmm4 -; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%r8), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps (%r9), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero -; AVX2-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] +; AVX2-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rdx), %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm3[1],xmm1[1],zero +; AVX2-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovaps (%rdx), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%rcx), %ymm8 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovaps (%r8), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%r9), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-NEXT: vmovaps 16(%rax), %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = 
xmm4[0],xmm0[1],xmm4[2,3] +; AVX2-NEXT: vbroadcastsd %xmm4, %ymm4 ; AVX2-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX2-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2],xmm6[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm1[1],xmm0[1],zero +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: 
vmovaps 32(%rsi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm4[2,2,2,2] +; AVX2-NEXT: vmovaps 32(%rdx), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovaps 32(%r8), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%r9), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2],ymm11[3,4,5],ymm0[6],ymm11[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-NEXT: vmovaps 48(%rax), %xmm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps 64(%rax), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm10 = 
xmm0[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm1[1],xmm10[2,3] +; AVX2-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-NEXT: vmovaps 64(%rax), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-NEXT: vshufps {{.*#+}} xmm11 = xmm1[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2],xmm11[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm1[1],xmm0[1],zero +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5],ymm11[6,7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-NEXT: vmovaps 64(%rsi), %ymm11 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = 
ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm10[2,2,2,2] +; AVX2-NEXT: vmovaps 64(%rdx), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%r8), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%r9), %xmm0 +; AVX2-NEXT: vmovaps 64(%rcx), %ymm10 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovaps 64(%r8), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%r9), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2],ymm15[3,4,5],ymm0[6],ymm15[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-NEXT: vmovaps 80(%rax), %xmm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%r8), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps 96(%rax), %xmm1 +; AVX2-NEXT: vmovaps 96(%r9), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} 
xmm13 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX2-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-NEXT: vmovaps 96(%rax), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%r8), %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps 128(%rax), %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero +; AVX2-NEXT: vblendps 
{{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-NEXT: vmovaps 96(%rdx), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovaps 96(%r8), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%r9), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-NEXT: vmovaps 112(%rax), %xmm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%r8), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%r9), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; 
AVX2-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX2-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-NEXT: vmovaps 128(%rax), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 128(%rcx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%r8), %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%r9), %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = 
ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-NEXT: vmovaps 128(%rdx), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovaps 128(%r8), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%r9), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-NEXT: vmovaps 144(%rax), %xmm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 160(%r8), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps 160(%rax), %xmm1 +; AVX2-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX2-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-NEXT: vmovaps 160(%rax), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 160(%rcx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%r9), %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 192(%r8), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps 192(%rax), %xmm1 +; AVX2-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 192(%rsi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 192(%rcx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 192(%rdx), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = 
ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps (%r8), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vmovaps 16(%rax), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vmovaps 48(%rax), %xmm2 
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 64(%r8), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vmovaps 80(%rax), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 96(%r8), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vmovaps 112(%rax), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 128(%rdx), %ymm1 +; AVX2-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 128(%r8), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vmovaps 144(%rax), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-NEXT: vmovaps 160(%rdx), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-NEXT: vmovaps 160(%rcx), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovaps 160(%r8), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rcx), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = 
ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 160(%r8), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vmovaps 176(%rax), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-NEXT: vmovaps 176(%rax), %xmm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 192(%r9), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 192(%r8), %xmm14 +; AVX2-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX2-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-NEXT: vmovaps 192(%rax), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm0[1],xmm1[1],zero +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rcx), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 192(%r8), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[2,1,2,3] -; AVX2-NEXT: vmovaps 208(%rax), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 192(%rsi), %ymm13 +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-NEXT: vmovaps 192(%rdx), %ymm14 +; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 192(%rcx), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX2-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 224(%r8), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-NEXT: vbroadcastss %xmm3, %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] -; AVX2-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vmovaps 224(%r8), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastss 232(%rax), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovaps 192(%r8), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 224(%rdx), %ymm12 -; AVX2-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] -; AVX2-NEXT: 
vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] -; AVX2-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-NEXT: vmovaps 192(%r9), %ymm14 +; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-NEXT: vmovaps 208(%rax), %xmm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 240(%rax), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-NEXT: vbroadcastss %xmm9, %xmm15 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm13, %xmm1 -; AVX2-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = 
xmm7[3,3],xmm6[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-NEXT: vbroadcastss %xmm8, %xmm7 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = 
xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 40(%rax), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss %xmm1, %xmm13 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm0, %xmm14 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 224(%r8), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm10, %xmm7 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; 
AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vbroadcastss %xmm15, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-NEXT: vbroadcastss 224(%rax), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6],ymm13[7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm5[3,3],xmm2[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6],ymm14[7] +; AVX2-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 72(%rax), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7] +; AVX2-NEXT: vbroadcastss 232(%rax), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4],ymm13[5,6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] 
-; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 104(%rax), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-NEXT: vmovaps 224(%rdx), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vmovaps 224(%rcx), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 136(%rax), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm10, %xmm7 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6],ymm13[7] +; AVX2-NEXT: vbroadcastsd 240(%r8), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] +; AVX2-NEXT: vbroadcastss 240(%r9), %xmm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 240(%rax), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm4, %xmm13 +; AVX2-NEXT: vbroadcastss %xmm3, %xmm14 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 168(%rax), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm14, %xmm6 -; 
AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm13, %xmm7 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 200(%rax), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, 
%ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 220(%r8), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-NEXT: vbroadcastss 220(%r9), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vbroadcastsd 216(%rax), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 240(%rdx), %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-NEXT: vbroadcastss 236(%r8), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = 
mem[1,2,2,3,5,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] -; AVX2-NEXT: vmovaps 224(%rax), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm15 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6],ymm13[7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,3],xmm2[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6],ymm14[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] +; AVX2-NEXT: vbroadcastsd 8(%rax), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3,4],ymm13[5,6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 16(%rdx), %ymm13 +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[4],ymm9[4],ymm2[5],ymm9[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm5[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0],ymm3[1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6],ymm3[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3,4,5,6],ymm5[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 252(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vbroadcastss 252(%r9), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vbroadcastsd 248(%rax), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm14[3,3],ymm15[7,7],ymm14[7,7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] 
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm15, %xmm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm8, %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 80(%rax), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3],xmm9[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] 
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX2-NEXT: vbroadcastsd 40(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vbroadcastss 48(%rdx), %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm3[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; 
AVX2-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm7[1,1],ymm3[5,5],ymm7[5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 112(%rax), %ymm2 +; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; 
AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm5[3,3],ymm8[7,7],ymm5[7,7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 144(%rax), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte 
Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm5, %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-NEXT: vbroadcastsd 72(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 80(%rdx), %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm4[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1],ymm10[1,1],ymm5[5,5],ymm10[5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 176(%rax), %ymm2 +; AVX2-NEXT: vbroadcastsd 80(%rax), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] -; AVX2-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 208(%rax), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[6],ymm5[6],ymm10[7],ymm5[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; 
AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm3[3,3],ymm4[7,7],ymm3[7,7] ; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-NEXT: vbroadcastsd 104(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 112(%rdx), %ymm0 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = 
ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm3[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vbroadcastsd 112(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm4[3,3],ymm6[7,7],ymm4[7,7] ; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-NEXT: vbroadcastsd 136(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 144(%rdx), %ymm0 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm3[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] 
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vbroadcastsd 144(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; 
AVX2-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm4[3,3],ymm6[7,7],ymm4[7,7] ; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-NEXT: vbroadcastsd 168(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 176(%rdx), %ymm0 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} 
ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm3[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm7[1,1],ymm6[5,5],ymm7[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vbroadcastsd 176(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm4[3,3],ymm8[7,7],ymm4[7,7] ; AVX2-NEXT: 
vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastss 144(%rdx), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = 
ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastss 176(%rdx), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = 
ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm1[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm13, 1440(%rax) -; AVX2-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-NEXT: vmovaps %ymm2, 1216(%rax) -; AVX2-NEXT: vmovaps %ymm3, 1088(%rax) -; AVX2-NEXT: vmovaps %ymm4, 992(%rax) -; AVX2-NEXT: vmovaps %ymm5, 864(%rax) -; AVX2-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-NEXT: vmovaps %ymm8, 640(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 544(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-NEXT: vmovaps %ymm12, 192(%rax) -; AVX2-NEXT: vmovaps 
%ymm10, 96(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-NEXT: vmovaps %ymm9, 1472(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1280(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1248(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1056(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1024(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 832(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 608(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1760(%rax) +; AVX2-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm14, %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-NEXT: vmovaps %xmm3, %xmm5 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-NEXT: vbroadcastsd 200(%rax), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vbroadcastss 208(%rdx), %ymm0 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm12[0],ymm2[1],ymm12[1],ymm2[4],ymm12[4],ymm2[5],ymm12[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm11[3,3],xmm5[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = 
xmm1[0,1,2],xmm14[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovaps %ymm12, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm12 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,1],ymm3[1,1],ymm0[5,5],ymm3[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4],ymm1[5],ymm11[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vbroadcastsd 208(%rax), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6],ymm1[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm11 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 220(%r8), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX2-NEXT: vbroadcastss 220(%r9), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vbroadcastsd 216(%rax), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0],ymm0[1,2,3,4,5,6],ymm11[7] +; AVX2-NEXT: vbroadcastss 240(%rdx), %ymm11 +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-NEXT: vbroadcastss 236(%r8), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6,7] +; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-NEXT: # xmm12 = mem[2,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-NEXT: # ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm14 = mem[1,2,2,3,5,6,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7] +; AVX2-NEXT: vmovaps 224(%rax), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = 
ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 252(%r8), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX2-NEXT: vbroadcastss 252(%r9), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vbroadcastsd 248(%rax), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps %ymm12, 1760(%rax) ; AVX2-NEXT: vmovaps %ymm0, 1728(%rax) +; AVX2-NEXT: vmovaps %ymm11, 1664(%rax) +; AVX2-NEXT: vmovaps %ymm5, 1536(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1664(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1536(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX2-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-NEXT: vmovaps %ymm6, 1472(%rax) +; AVX2-NEXT: vmovaps %ymm7, 1440(%rax) +; AVX2-NEXT: vmovaps %ymm8, 1408(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1376(%rax) +; AVX2-NEXT: vmovaps %ymm9, 1344(%rax) +; AVX2-NEXT: vmovaps %ymm10, 1312(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 1344(%rax) +; AVX2-NEXT: vmovaps %ymm0, 1280(%rax) +; AVX2-NEXT: vmovaps %ymm13, 1248(%rax) +; AVX2-NEXT: vmovaps %ymm15, 1216(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1184(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11695,30 +11571,70 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1120(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 1088(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 1056(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 1024(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 992(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 960(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 928(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 864(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 832(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 800(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 768(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 736(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 640(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 608(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 576(%rax) +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 512(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 32(%rax) @@ -11732,1031 +11648,969 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-NEXT: addq $2968, %rsp # imm = 0xB98 +; AVX2-NEXT: addq $2792, %rsp # imm = 0xAE8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i32_stride7_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $2968, %rsp # imm = 0xB98 +; AVX2-FP-NEXT: subq $2792, %rsp # imm = 0xAE8 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[1],zero +; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm1 +; AVX2-FP-NEXT: vbroadcastss 228(%r8), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps (%r8), %xmm13 -; AVX2-FP-NEXT: vmovaps 
32(%r8), %xmm4 -; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%r8), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] +; AVX2-FP-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rdx), %xmm3 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm3[1],xmm1[1],zero +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3] ; 
AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm1 -; AVX2-FP-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps (%rcx), %ymm8 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%r8), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm0 +; AVX2-FP-NEXT: vmovaps (%r9), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = 
ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 96(%rax), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm1 -; AVX2-FP-NEXT: vmovaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 128(%rcx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%r8), %xmm1 +; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%r9), %xmm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm0[1],xmm4[2,3] +; AVX2-FP-NEXT: vbroadcastsd %xmm4, %ymm4 +; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; 
AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 160(%rcx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm1 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,2,2] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2],xmm6[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 192(%rcx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rdx), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%r8), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%r9), %ymm1 -; 
AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm1[1],xmm0[1],zero +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 48(%rax), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm4[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2],ymm11[3,4,5],ymm0[6],ymm11[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] +; 
AVX2-FP-NEXT: vmovaps 48(%rax), %xmm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm1[1],xmm10[2,3] +; AVX2-FP-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm11 = xmm1[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2],xmm11[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-FP-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm1[1],xmm0[1],zero +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm11 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm10 = 
ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm10[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm10 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 80(%rax), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2],ymm15[3,4,5],ymm0[6],ymm15[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FP-NEXT: vmovaps 80(%rax), %xmm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX2-FP-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-FP-NEXT: 
vmovaps 96(%rax), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps 96(%rcx), 
%ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%r9), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 112(%rax), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FP-NEXT: vmovaps 112(%rax), %xmm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX2-FP-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps 
{{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 144(%rax), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FP-NEXT: vmovaps 144(%rax), %xmm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FP-NEXT: vmovaps 160(%r8), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 160(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX2-FP-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vmovaps 160(%rcx), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm1 +; AVX2-FP-NEXT: vmovaps 160(%rcx), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 176(%rax), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} 
ymm14 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FP-NEXT: vmovaps 176(%rax), %xmm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm14 +; AVX2-FP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX2-FP-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm1[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2],xmm14[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm0[1],xmm1[1],zero +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: 
vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 208(%rax), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm13 +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm14 +; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-FP-NEXT: vmovaps 
224(%rdx), %xmm1 -; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 224(%r8), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-FP-NEXT: vbroadcastss %xmm3, %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-FP-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 232(%rax), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm12 -; AVX2-FP-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] -; AVX2-FP-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-FP-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm14 +; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2],ymm14[3,4,5],ymm0[6],ymm14[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FP-NEXT: vmovaps 208(%rax), %xmm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = 
ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 240(%rax), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm15 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps %xmm13, %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm7 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 40(%rax), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 
= ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm13 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm14 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 224(%r8), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm7 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-FP-NEXT: 
vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vbroadcastss %xmm15, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FP-NEXT: vbroadcastss 224(%rax), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm5[3,3],xmm2[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6],ymm14[7] +; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 72(%rax), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 232(%rax), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4],ymm13[5,6,7] +; 
AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: 
vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 104(%rax), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm7 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovaps 224(%rcx), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} 
xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 136(%rax), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm7 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6],ymm13[7] +; AVX2-FP-NEXT: vbroadcastsd 240(%r8), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] +; AVX2-FP-NEXT: vbroadcastss 240(%r9), %xmm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 240(%rax), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm4, %xmm13 +; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm14 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 168(%rax), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm7 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; 
AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 200(%rax), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 220(%r8), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FP-NEXT: vbroadcastss 220(%r9), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vbroadcastsd 216(%rax), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 240(%rdx), %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vbroadcastss 236(%r8), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 
= ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovaps 224(%rax), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm15 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6],ymm13[7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,3],xmm2[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6],ymm14[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] +; AVX2-FP-NEXT: vbroadcastsd 8(%rax), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0,1],ymm14[2,3,4],ymm13[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 16(%rdx), %ymm13 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[4],ymm9[4],ymm2[5],ymm9[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm5[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0],ymm3[1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6],ymm3[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps 
{{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 252(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vbroadcastss 252(%r9), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vbroadcastsd 248(%rax), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 
= ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm14[3,3],ymm15[7,7],ymm14[7,7] +; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm0 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm8, %ymm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: 
vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 80(%rax), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3],xmm9[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vbroadcastsd 40(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 48(%rdx), %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm3[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm7[1,1],ymm3[5,5],ymm7[5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 112(%rax), %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 
= ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm5[3,3],ymm8[7,7],ymm5[7,7] +; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 144(%rax), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm5, %ymm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; 
AVX2-FP-NEXT: vbroadcastsd 72(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 80(%rdx), %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm4[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1],ymm10[1,1],ymm5[5,5],ymm10[5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = 
mem[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 176(%rax), %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 80(%rax), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 208(%rax), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FP-NEXT: 
vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[6],ymm5[6],ymm10[7],ymm5[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm3[3,3],ymm4[7,7],ymm3[7,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vbroadcastsd 104(%rax), %ymm2 +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 112(%rdx), %ymm0 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm3[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vbroadcastsd 112(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm4[3,3],ymm6[7,7],ymm4[7,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vbroadcastsd 136(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 144(%rdx), %ymm0 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm3[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-FP-NEXT: 
vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vbroadcastsd 144(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm4[3,3],ymm6[7,7],ymm4[7,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = 
mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-FP-NEXT: vunpckhps 
{{.*#+}} xmm1 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vbroadcastsd 168(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 176(%rdx), %ymm0 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FP-NEXT: vunpckhps 
{{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm3[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm7[1,1],ymm6[5,5],ymm7[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vbroadcastsd 176(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; 
AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm4[3,3],ymm8[7,7],ymm4[7,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastss 144(%rdx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm14, %ymm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; 
AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FP-NEXT: vmovaps %xmm3, %xmm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vbroadcastsd 200(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vbroadcastss 208(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm12[0],ymm2[1],ymm12[1],ymm2[4],ymm12[4],ymm2[5],ymm12[5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[3,3,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastss 176(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm11[3,3],xmm5[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm14[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovaps %ymm12, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, %ymm12 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,1],ymm3[1,1],ymm0[5,5],ymm3[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4],ymm1[5],ymm11[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vbroadcastsd 208(%rax), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = 
ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 220(%r8), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FP-NEXT: vbroadcastss 220(%r9), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vbroadcastsd 216(%rax), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0],ymm0[1,2,3,4,5,6],ymm11[7] +; AVX2-FP-NEXT: vbroadcastss 240(%rdx), %ymm11 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vbroadcastss 236(%r8), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm12 = mem[2,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = 
ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FP-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm1[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm14 = mem[1,2,2,3,5,6,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7] +; AVX2-FP-NEXT: vmovaps 224(%rax), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 252(%r8), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FP-NEXT: vbroadcastss 252(%r9), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vbroadcastsd 248(%rax), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm13, 1440(%rax) -; AVX2-FP-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, 1216(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 1088(%rax) -; AVX2-FP-NEXT: vmovaps %ymm4, 992(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 864(%rax) -; AVX2-FP-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-FP-NEXT: vmovaps %ymm8, 640(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-FP-NEXT: vmovaps %ymm12, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm10, 96(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 1472(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1248(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1056(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1024(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1760(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm12, 1760(%rax) ; AVX2-FP-NEXT: vmovaps %ymm0, 1728(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 1664(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 1536(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1664(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1536(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-FP-NEXT: vmovaps %ymm6, 1472(%rax) +; AVX2-FP-NEXT: vmovaps %ymm7, 1440(%rax) +; AVX2-FP-NEXT: vmovaps %ymm8, 1408(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1376(%rax) +; AVX2-FP-NEXT: vmovaps %ymm9, 1344(%rax) +; AVX2-FP-NEXT: vmovaps %ymm10, 1312(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rax) 
+; AVX2-FP-NEXT: vmovaps %ymm13, 1248(%rax) +; AVX2-FP-NEXT: vmovaps %ymm15, 1216(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1184(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12764,30 +12618,70 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1120(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 1088(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 1056(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 1024(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 992(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 928(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 864(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 800(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 768(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 736(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rax) 
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) @@ -12801,1066 +12695,1033 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-FP-NEXT: addq $2968, %rsp # imm = 0xB98 +; AVX2-FP-NEXT: addq $2792, %rsp # imm = 0xAE8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i32_stride7_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX2-FCP-NEXT: subq $2808, %rsp # imm = 0xAF8 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0 +; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps (%r8), %xmm2 +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[1],zero +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm1 +; AVX2-FCP-NEXT: vbroadcastss 228(%r8), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm13 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps (%r8), %xmm6 +; AVX2-FCP-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 
32(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm5[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} 
ymm0 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm8 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm0 +; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FCP-NEXT: vmovaps 32(%rcx), 
%xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm2[1],xmm1[1],zero +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FCP-NEXT: 
vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 128(%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm1[2],ymm11[3,4,5],ymm1[6],ymm11[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FCP-NEXT: vmovaps 48(%rax), %xmm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5],ymm11[6,7] +; 
AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%r9), %xmm0 +; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 160(%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm2[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2],xmm11[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 192(%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm2[1],xmm1[1],zero +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5],ymm11[6,7] ; 
AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm15 -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm13 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] -; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm12 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovaps 48(%rax), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm1[2],ymm15[3,4,5],ymm1[6],ymm15[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FCP-NEXT: vmovaps 80(%rax), %xmm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 
64(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm2[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2],xmm14[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm2[1],xmm1[1],zero +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovaps 80(%rax), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm1 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovaps 112(%rax), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FCP-NEXT: vmovaps 112(%rax), %xmm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm1 +; AVX2-FCP-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm2[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2],xmm14[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FCP-NEXT: vmovaps 128(%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm2[1],xmm1[1],zero +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = 
ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r9), %ymm1 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovaps 144(%rax), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 128(%r9), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FCP-NEXT: vmovaps 144(%rax), %xmm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 160(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 160(%r9), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 160(%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm2[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2],xmm14[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FCP-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm2[1],xmm1[1],zero +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm2 ; AVX2-FCP-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm2 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FCP-NEXT: vmovaps 176(%rax), %xmm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2],xmm14[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FCP-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm0[1],xmm1[1],zero +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm2[3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovaps 176(%rax), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm12 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm8 -; AVX2-FCP-NEXT: 
vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm1 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm14 +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovaps 208(%rax), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm14 +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FCP-NEXT: vmovaps 208(%rax), %xmm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = 
ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm6 -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm3[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 228(%r8), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm14 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,1,2,2,0,1,2,2] +; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm11, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: 
vbroadcastsd 224(%r8), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vbroadcastss %xmm13, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FCP-NEXT: vbroadcastss 224(%rax), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3],xmm0[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm11, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6],ymm15[7] +; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 232(%rax), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm15[4],ymm14[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm5 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2] -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm11, 
%ymm9, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 224(%r8), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vbroadcastss %xmm4, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FCP-NEXT: vbroadcastss 224(%rax), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 232(%rax), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: 
vmovaps 224(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7] -; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 220(%r8), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 220(%r9), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 216(%rax), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 240(%rdx), %ymm11 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 236(%r8), %ymm14 
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] -; AVX2-FCP-NEXT: vpermps 224(%r9), %ymm11, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7] -; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] +; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 240(%rax), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm14 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm15 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm11, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm5, %ymm13 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm13 = xmm3[3,3],xmm0[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FCP-NEXT: vpermps %ymm14, %ymm11, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6],ymm14[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm6[0,1,2],xmm14[3] +; AVX2-FCP-NEXT: vbroadcastsd 8(%rax), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3,4],ymm13[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm13 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm6[3,3],xmm4[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,1],ymm8[1,1],ymm2[5,5],ymm8[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6],ymm4[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[6],ymm2[6],ymm8[7],ymm2[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 252(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vbroadcastss 252(%r9), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 248(%rax), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm0 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,3],ymm14[3,3],ymm15[7,7],ymm14[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm15, %xmm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm14, %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss 
%xmm4, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm8, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm13[3,3],xmm9[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 40(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm6[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm7[1,1],ymm4[5,5],ymm7[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[6],ymm4[6],ymm7[7],ymm4[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3],xmm7[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 72(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = 
ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1],ymm12[1,1],ymm7[5,5],ymm12[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = 
ymm5[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[6],ymm7[6],ymm12[7],ymm7[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,3],ymm5[3,3],ymm6[7,7],ymm5[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 
16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm7, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3],xmm6[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vbroadcastsd 40(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3],ymm5[3,3],ymm7[7,7],ymm5[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vbroadcastsd 72(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = 
ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; 
AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm7, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3],xmm6[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 136(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 144(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} 
ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 112(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 144(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = 
ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3],ymm5[3,3],ymm7[7,7],ymm5[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm7, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vbroadcastsd 136(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3],xmm6[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 168(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 144(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vbroadcastss 176(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 176(%rax), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm5[3,3],ymm9[7,7],ymm5[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0],ymm1[1,2,3,4],ymm2[5,6,7] 
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm1 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm15, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm6[3,3],xmm5[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vbroadcastsd 168(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 176(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm0 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = 
xmm3[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 208(%rdx), %ymm0 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3],xmm4[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm15[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovaps %ymm13, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm13 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = 
mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,1],ymm3[1,1],ymm0[5,5],ymm3[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] ; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[6],ymm13[6],ymm15[7],ymm13[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%r8), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%r9), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 216(%rax), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm0[1,2,3,4,5,6],ymm12[7] +; AVX2-FCP-NEXT: vbroadcastss 240(%rdx), %ymm12 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6],ymm13[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = 
ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm1 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm8[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = 
ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 144(%rdx), %ymm10 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vbroadcastss 236(%r8), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm13 = mem[2,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm8 
# 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = xmm8[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6] +; AVX2-FCP-NEXT: vpermps 224(%r9), %ymm15, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: 
vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 176(%rdx), %ymm14 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm8[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vbroadcastss 208(%rdx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm8 = xmm1[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 252(%r8), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-FCP-NEXT: vbroadcastss 252(%r9), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 248(%rax), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm4, 1440(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FCP-NEXT: vmovaps 
%ymm14, 1216(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm13, 1088(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm10, 992(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, 864(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 768(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm6, 640(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 544(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm13, 1760(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 1728(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm12, 1664(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, 1536(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 1472(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm7, 1440(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm8, 1408(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1472(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1408(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 1376(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 1344(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm10, 1312(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm14, 1248(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1248(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 1216(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1184(%rax) ; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1120(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 1088(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1024(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 992(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 928(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 736(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rax) +; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1760(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1728(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1664(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1536(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1376(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 928(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1696(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1632(%rax) @@ -13868,7 +13729,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 1568(%rax) ; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX2-FCP-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX2-FCP-NEXT: addq $2808, %rsp # imm = 0xAF8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index a9da7abaa945c..866cd287dbcf0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -3430,9 +3430,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-NEXT: vmovdqa 16(%r10), %xmm11 +; AVX512-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX512-NEXT: vmovdqa (%rax), %xmm5 -; AVX512-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX512-NEXT: vmovdqa 16(%rax), %xmm13 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] @@ -3444,114 +3444,117 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %xmm7 ; AVX512-NEXT: vmovdqa 16(%r8), %xmm15 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm4 +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; 
AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7,8],ymm13[9],ymm3[10,11,12],ymm13[13],ymm3[14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm4, %ymm4 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm11 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4],ymm1[5],ymm11[6,7,8],ymm1[9],ymm11[10,11,12],ymm1[13],ymm11[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm18 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7],ymm1[8,9,10],ymm3[11],ymm1[12,13,14],ymm3[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX512-NEXT: vmovdqa 16(%rdx), %xmm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; 
AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm1, %ymm19 -; AVX512-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm19 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5,6],ymm1[7],ymm11[8,9,10],ymm1[11],ymm11[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm4 +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; 
AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX512-NEXT: vmovdqa 16(%rcx), %xmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm18 +; AVX512-NEXT: vmovdqa 16(%rdx), %xmm11 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm10, %ymm4, %ymm16 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = 
xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4],ymm10[5],ymm3[6,7,8],ymm10[9],ymm3[10,11,12],ymm10[13],ymm3[14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm16 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm3 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7],ymm12[8,9,10],ymm3[11],ymm12[12,13,14],ymm3[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3],ymm10[4,5,6],ymm0[7],ymm10[8,9,10],ymm0[11],ymm10[12,13,14],ymm0[15] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm12 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] @@ -3582,8 +3585,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] @@ -3594,14 +3597,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rax) -; 
AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -3774,160 +3777,165 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm10 ; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 -; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm11 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm11 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm12 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, 
%ymm7, %ymm7 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm13 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm14, 
%ymm13, %ymm13 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4],ymm0[5],ymm14[6,7,8],ymm0[9],ymm14[10,11,12],ymm0[13],ymm14[14,15] ; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm16 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm16 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm16 {%k1} -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm17 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm18 -; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm16 {%k1} +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw 
{{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm20 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm5, %ymm18 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm12, %ymm19 -; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm15 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm20 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm0, %ymm21 -; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 
= xmm12[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7,8],ymm12[9],ymm2[10,11,12],ymm12[13],ymm2[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm5, %ymm19 +; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7] +; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm13, %ymm9 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7,8],ymm1[9],ymm4[10,11,12],ymm1[13],ymm4[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm17 {%k1} +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0,0,1,1,2,2,3,3] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7,8],ymm9[9],ymm5[10,11,12],ymm9[13],ymm5[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm10 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm7 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] @@ -3936,13 +3944,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; 
AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm4 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index a166bebae721c..57bca8f4ee3e0 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -1393,29 +1393,29 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: mul_v2i64_neg_15_63: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm2 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE2-NEXT: paddq %xmm1, %xmm2 -; X86-SSE2-NEXT: psllq $32, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlq $32, %xmm3 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: paddq 
%xmm3, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 ; X86-SSE2-NEXT: paddq %xmm2, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-SSE4-LABEL: mul_v2i64_neg_15_63: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] ; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE4-NEXT: psrlq $32, %xmm2 -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553] -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE4-NEXT: paddq %xmm1, %xmm2 -; X86-SSE4-NEXT: psllq $32, %xmm2 -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: psrlq $32, %xmm3 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: paddq %xmm3, %xmm0 +; X86-SSE4-NEXT: psllq $32, %xmm0 ; X86-SSE4-NEXT: paddq %xmm2, %xmm0 ; X86-SSE4-NEXT: retl ; @@ -1482,29 +1482,29 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: mul_v2i64_neg_17_65: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm2 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE2-NEXT: paddq %xmm1, %xmm2 -; X86-SSE2-NEXT: psllq $32, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlq $32, %xmm3 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: paddq %xmm3, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 ; X86-SSE2-NEXT: paddq %xmm2, %xmm0 ; 
X86-SSE2-NEXT: retl ; ; X86-SSE4-LABEL: mul_v2i64_neg_17_65: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] ; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE4-NEXT: psrlq $32, %xmm2 -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551] -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE4-NEXT: paddq %xmm1, %xmm2 -; X86-SSE4-NEXT: psllq $32, %xmm2 -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: psrlq $32, %xmm3 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: paddq %xmm3, %xmm0 +; X86-SSE4-NEXT: psllq $32, %xmm0 ; X86-SSE4-NEXT: paddq %xmm2, %xmm0 ; X86-SSE4-NEXT: retl ; @@ -2033,8 +2033,8 @@ define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) { ; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll index 5b43acbe52375..084a1faa516fe 100644 --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -473,9 +473,10 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: 
pand %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1749,7 +1750,7 @@ define <16 x i1> @is_positive_mask_v16i16_v16i1(<16 x i16> %x, <16 x i1> %y) { ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll index 008e3e4c217cb..b0ad27e3a790a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll @@ -1087,11 +1087,13 @@ define double @test_v4f64(<4 x double> %a0) { ; ; SSE41-LABEL: test_v4f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm0, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: maxpd %xmm2, %xmm3 +; SSE41-NEXT: maxpd %xmm4, %xmm3 ; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 @@ -1271,45 +1273,45 @@ define double @test_v8f64(<8 x double> %a0) { ; ; SSE41-LABEL: test_v8f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm0, %xmm4 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, 
%xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: maxpd %xmm6, %xmm5 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: maxpd %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: movaps %xmm4, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: maxpd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: maxpd %xmm3, %xmm4 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: maxpd %xmm2, %xmm3 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm1 -; SSE41-NEXT: maxpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm5, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE41-NEXT: movq %xmm3, %rax ; SSE41-NEXT: testq %rax, %rax ; SSE41-NEXT: js .LBB7_1 ; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: jmp .LBB7_3 ; SSE41-NEXT: .LBB7_1: ; SSE41-NEXT: movapd %xmm2, %xmm0 
-; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: .LBB7_3: ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: maxsd %xmm0, %xmm1 @@ -1568,82 +1570,81 @@ define double @test_v16f64(<16 x double> %a0) { ; ; SSE41-LABEL: test_v16f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: movapd %xmm0, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm10 -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 +; SSE41-NEXT: movaps %xmm0, %xmm8 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: movaps %xmm3, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: maxpd %xmm10, %xmm9 +; SSE41-NEXT: movapd %xmm7, %xmm3 +; SSE41-NEXT: maxpd %xmm9, %xmm3 ; SSE41-NEXT: movapd %xmm7, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movapd %xmm8, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: maxpd %xmm7, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: maxpd %xmm7, %xmm1 ; SSE41-NEXT: movapd %xmm5, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: maxpd %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE41-NEXT: movapd %xmm9, 
%xmm3 -; SSE41-NEXT: maxpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm9, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm2, %xmm5 -; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm2 -; SSE41-NEXT: maxpd %xmm5, %xmm2 +; SSE41-NEXT: maxpd %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm6, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm1 -; SSE41-NEXT: maxpd %xmm5, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: movaps %xmm8, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: maxpd %xmm3, %xmm5 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: maxpd %xmm4, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: maxpd %xmm3, %xmm4 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, 
%xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: maxpd %xmm2, %xmm3 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: maxpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE41-NEXT: movq %xmm3, %rax ; SSE41-NEXT: testq %rax, %rax ; SSE41-NEXT: js .LBB8_1 ; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: jmp .LBB8_3 ; SSE41-NEXT: .LBB8_1: ; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: .LBB8_3: ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: maxsd %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index ab95081e2938e..5e3ef32ef7e4a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -19,7 +19,8 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -33,7 +34,7 @@ define i64 
@test_v2i64(<2 x i64> %a0) { ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1OR2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1OR2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1OR2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1OR2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1OR2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -47,7 +48,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -61,7 +62,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -106,7 +107,8 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -129,7 +131,7 @@ define i64 @test_v4i64(<4 x 
i64> %a0) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -153,7 +155,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -177,7 +179,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -201,7 +203,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -272,7 +274,8 @@ define i64 @test_v8i64(<8 x 
i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -312,7 +315,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -344,7 +347,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -377,7 +380,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -410,7 +413,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -524,7 +527,8 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -598,7 +602,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -646,7 +650,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -687,7 +691,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: 
vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -728,7 +732,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -804,8 +808,10 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm3 ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i32: @@ -832,15 +838,21 @@ define i32 @test_v4i32(<4 x i32> %a0) { define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-LABEL: test_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE2-NEXT: pmuludq %xmm3, %xmm0 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; @@ -898,18 +910,32 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pmuludq %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] ; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq @@ -974,34 +1000,64 @@ define i32 @test_v16i32(<16 x i32> %a0) { define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-LABEL: test_v32i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm10 -; SSE2-NEXT: pmuludq %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm11 -; SSE2-NEXT: pmuludq %xmm9, %xmm11 -; SSE2-NEXT: pmuludq %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE2-NEXT: pmuludq %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; 
SSE2-NEXT: pmuludq %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] 
-; SSE2-NEXT: pmuludq %xmm11, %xmm1 -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v32i32: diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll index 322fdde106dcf..95aea6f524023 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -42,10 +42,12 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pmovsxdq %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax @@ -133,22 +135,26 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor 
%xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pmovsxdq %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -289,10 +295,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -300,10 +308,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; 
SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm0 @@ -311,22 +321,26 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: xorpd %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: retq @@ -544,10 +558,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm10 ; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3] ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm7, %xmm0 @@ -555,10 +571,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm3, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -566,10 +584,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 @@ -577,10 +597,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm0 @@ -588,10 +610,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm4, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 ; SSE41-NEXT: movapd %xmm7, %xmm0 @@ -599,10 +623,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 ; SSE41-NEXT: movapd %xmm7, %xmm0 @@ -610,22 +636,26 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm6, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; 
SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; SSE41-NEXT: movdqa %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll index bb87740c21538..f86d3c1dd6e3f 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -37,14 +37,17 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pmovsxdq %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor 
%xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax @@ -132,10 +135,12 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] @@ -143,10 +148,12 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pmovsxdq %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax @@ -289,10 +296,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm3, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; 
SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -300,10 +309,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -311,10 +322,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: xorpd %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] @@ -322,10 +335,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor 
%xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pmovsxdq %xmm5, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax @@ -544,10 +559,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm6, %xmm10 ; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3] ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm8, %xmm0 @@ -555,10 +572,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -566,10 +585,12 @@ define i64 @test_v16i64(<16 x 
i64> %a0) { ; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -577,10 +598,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE41-NEXT: movapd %xmm5, %xmm0 @@ -588,10 +611,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm7, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 ; 
SSE41-NEXT: movapd %xmm4, %xmm0 @@ -599,10 +624,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm6, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm0 @@ -610,10 +637,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm7, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] @@ -621,11 +650,13 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm9 ; SSE41-NEXT: movdqa %xmm9, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE41-NEXT: pmovsxdq %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index b355c3dee5309..2f3c1e09ea78e 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -42,10 +42,12 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pmovsxdq %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax @@ -151,22 +153,26 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: 
movdqa %xmm0, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pmovsxdq %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -324,10 +330,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -335,10 +343,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm0 @@ -346,22 +356,26 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: xorpd %xmm5, %xmm1 ; 
SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: retq @@ -607,10 +621,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm10 ; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3] ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; 
SSE41-NEXT: movdqa %xmm7, %xmm0 @@ -618,10 +634,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm3, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -629,10 +647,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 @@ -640,10 +660,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm0 @@ -651,10 +673,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm4, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 ; SSE41-NEXT: movapd %xmm7, %xmm0 @@ -662,10 +686,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 ; SSE41-NEXT: movapd %xmm7, %xmm0 @@ -673,22 +699,26 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm6, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; SSE41-NEXT: movdqa %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 2d68cf9d6374d..dc8d0d53a91a1 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -37,14 +37,17 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pmovsxdq %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; 
SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax @@ -150,10 +153,12 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] @@ -161,10 +166,12 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pmovsxdq %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pmovsxdq %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax @@ -325,10 +332,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm3, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -336,10 +345,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 @@ -347,10 +358,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: xorpd %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] @@ -358,10 +371,12 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pmovsxdq %xmm5, %xmm0 -; 
SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax @@ -610,10 +625,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm6, %xmm10 ; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3] ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm8, %xmm0 @@ -621,10 +638,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -632,10 +651,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; 
SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -643,10 +664,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE41-NEXT: movapd %xmm5, %xmm0 @@ -654,10 +677,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm7, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 ; SSE41-NEXT: movapd %xmm4, %xmm0 @@ -665,10 +690,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm6, %xmm1 ; SSE41-NEXT: xorpd %xmm9, 
%xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm0 @@ -676,10 +703,12 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: movapd %xmm7, %xmm1 ; SSE41-NEXT: xorpd %xmm9, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] @@ -687,11 +716,13 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm9 ; SSE41-NEXT: movdqa %xmm9, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE41-NEXT: pmovsxdq %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 993e6afc0eaf3..1a8aa809e5db5 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -236,21 +236,22 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE2-LABEL: var_rotate_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pmulhuw %xmm1, %xmm2 ; SSE2-NEXT: pmullw %xmm1, %xmm0 @@ -260,15 +261,16 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE41-LABEL: var_rotate_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: pslld $23, %xmm3 +; SSE41-NEXT: paddd %xmm2, %xmm3 +; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmulhuw %xmm2, %xmm1 @@ -279,7 +281,8 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX1-LABEL: var_rotate_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 @@ -383,17 +386,18 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-SSE2-LABEL: var_rotate_v8i16: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; 
X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm4, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; X86-SSE2-NEXT: pslld $23, %xmm1 -; X86-SSE2-NEXT: paddd %xmm3, %xmm1 +; X86-SSE2-NEXT: paddd %xmm4, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; X86-SSE2-NEXT: pslld $16, %xmm1 ; X86-SSE2-NEXT: psrad $16, %xmm1 @@ -929,18 +933,32 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { } define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { -; SSE-LABEL: splatvar_rotate_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE-NEXT: psllw %xmm1, %xmm2 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: psllw %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_rotate_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, 
%xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_rotate_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_rotate_v16i8: ; AVX: # %bb.0: @@ -981,9 +999,10 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; X86-SSE2-LABEL: splatvar_rotate_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: psllw %xmm1, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index c2c6a5f7eba57..32e03e0a93d70 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -175,28 +175,29 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vbroadcastss 
{{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6 -; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7 +; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 ; AVX1-NEXT: 
vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 2b1cf5b671e53..85f38deed700d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -157,42 +157,45 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE2-LABEL: var_shift_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; 
SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: pslld $23, %xmm3 +; SSE41-NEXT: paddd %xmm2, %xmm3 +; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 @@ -256,17 +259,18 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; X86-SSE-LABEL: var_shift_v8i16: ; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; X86-SSE-NEXT: pslld $23, %xmm2 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE-NEXT: paddd %xmm3, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE-NEXT: paddd %xmm4, %xmm2 ; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE-NEXT: pslld $16, %xmm2 ; 
X86-SSE-NEXT: psrad $16, %xmm2 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; X86-SSE-NEXT: pslld $23, %xmm1 -; X86-SSE-NEXT: paddd %xmm3, %xmm1 +; X86-SSE-NEXT: paddd %xmm4, %xmm1 ; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1 ; X86-SSE-NEXT: pslld $16, %xmm1 ; X86-SSE-NEXT: psrad $16, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 4f55f7af20f47..0dbbe42268015 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -158,25 +158,26 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-LABEL: var_shift_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm4 +; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -247,19 +248,20 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; X86-AVX1-LABEL: var_shift_v16i16: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7] -; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm5 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4 -; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 +; X86-AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm5 +; X86-AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; X86-AVX1-NEXT: vpslld $23, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; X86-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 -; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X86-AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4,4,5,5,6,6,7,7] +; X86-AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X86-AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 +; 
X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; X86-AVX1-NEXT: vpslld $23, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4 ; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index d245bdca6ee29..3996d7f09f01c 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -93,42 +93,45 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: var_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v4i16: ; SSE41: # 
%bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: pslld $23, %xmm3 +; SSE41-NEXT: paddd %xmm2, %xmm3 +; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 @@ -192,17 +195,18 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; ; X86-SSE-LABEL: var_shift_v4i16: ; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; X86-SSE-NEXT: pslld $23, %xmm2 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE-NEXT: paddd %xmm3, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} 
xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE-NEXT: paddd %xmm4, %xmm2 ; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE-NEXT: pslld $16, %xmm2 ; X86-SSE-NEXT: psrad $16, %xmm2 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; X86-SSE-NEXT: pslld $23, %xmm1 -; X86-SSE-NEXT: paddd %xmm3, %xmm1 +; X86-SSE-NEXT: paddd %xmm4, %xmm1 ; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1 ; X86-SSE-NEXT: pslld $16, %xmm1 ; X86-SSE-NEXT: psrad $16, %xmm1 @@ -216,42 +220,45 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; SSE2-LABEL: var_shift_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: pmullw 
%xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: pslld $23, %xmm3 +; SSE41-NEXT: paddd %xmm2, %xmm3 +; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 ; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 @@ -315,17 +322,18 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; ; X86-SSE-LABEL: var_shift_v2i16: ; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; X86-SSE-NEXT: pslld $23, %xmm2 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = 
[1065353216,1065353216,1065353216,1065353216] -; X86-SSE-NEXT: paddd %xmm3, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE-NEXT: paddd %xmm4, %xmm2 ; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE-NEXT: pslld $16, %xmm2 ; X86-SSE-NEXT: psrad $16, %xmm2 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; X86-SSE-NEXT: pslld $23, %xmm1 -; X86-SSE-NEXT: paddd %xmm3, %xmm1 +; X86-SSE-NEXT: paddd %xmm4, %xmm1 ; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1 ; X86-SSE-NEXT: pslld $16, %xmm1 ; X86-SSE-NEXT: psrad $16, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index dbbfaab9ea26a..4687530645cec 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -6152,9 +6152,9 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1 ; ; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq ; @@ -6176,9 +6176,9 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1 ; ; XOPAVX2-LABEL: 
shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6334,9 +6334,9 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1 ; ; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq ; @@ -6359,9 +6359,9 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6518,7 +6518,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2 ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq @@ -6542,7 +6542,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2 ; XOPAVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; XOPAVX2-NEXT: retq @@ -6596,7 +6596,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2 ; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq @@ -6621,7 +6621,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2 ; XOPAVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; XOPAVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 2df013d0ff3e3..de6e79550e869 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -377,9 +377,9 @@ define void @PR39483() { ; X86-AVX1-NEXT: vmovups 64, %ymm1 ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7] -; X86-AVX1-NEXT: vmovups 16, %xmm2 -; 
X86-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4] ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] ; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -417,9 +417,9 @@ define void @PR39483() { ; X64-AVX1-NEXT: vmovups 64, %ymm1 ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; X64-AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7] -; X64-AVX1-NEXT: vmovups 16, %xmm2 -; X64-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] +; X64-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4] ; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index 12d494c32b656..02ff8a33cfbd3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -851,23 +851,20 @@ define <16 x i8> @constant_fold_pshufb_2() { define i32 @mask_zzz3_v16i8(<16 x i8> %a0) { ; SSSE3-LABEL: mask_zzz3_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: 
pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: mask_zzz3_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14] ; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSE41-NEXT: retq ; ; AVX-LABEL: mask_zzz3_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14] ; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; AVX-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = bitcast <16 x i8> %1 to <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 68040b58858a7..3598d0bdd293f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2854,13 +2854,16 @@ define <4 x float> @PR30264(<4 x float> %x) { define <8 x i16> @PR39549(<16 x i8> %x) { ; SSE-LABEL: PR39549: ; SSE: # %bb.0: -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: PR39549: ; AVX: # %bb.0: -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX-NEXT: retq %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index da8a3f3fa0d4e..9481d9ae70471 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -56,25 +56,28 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; 
SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: trunc_packus_v2i64_v2i32: @@ -174,25 +177,28 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; @@ -314,51 +320,57 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; ; SSE41-LABEL: trunc_packus_v4i64_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm7 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; 
SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v4i64_v4i32: @@ -579,91 +591,106 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE41-LABEL: trunc_packus_v8i64_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm5 -; SSE41-NEXT: movdqa 16(%rdi), %xmm8 -; SSE41-NEXT: movdqa 32(%rdi), %xmm7 -; SSE41-NEXT: movdqa 48(%rdi), %xmm2 +; SSE41-NEXT: movdqa 16(%rdi), %xmm9 +; SSE41-NEXT: movdqa 32(%rdi), %xmm8 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm7 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; 
SSE41-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm2 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: xorpd %xmm3, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; 
SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: xorpd %xmm3, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd 
%xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: xorpd %xmm3, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] ; SSE41-NEXT: movaps %xmm2, %xmm0 @@ -784,25 +811,28 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 +; 
SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE41-NEXT: retq ; @@ -927,25 +957,28 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; 
SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE41-NEXT: movd %xmm0, (%rdi) ; SSE41-NEXT: retq @@ -1097,52 +1130,58 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; ; SSE41-LABEL: trunc_packus_v4i64_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; 
SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v4i64_v4i16: @@ -1288,51 +1327,57 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-LABEL: trunc_packus_v4i64_v4i16_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, 
%xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = 
xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm1 -; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: movq %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v4i64_v4i16_store: @@ -1535,94 +1580,109 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; SSE41-LABEL: trunc_packus_v8i64_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm7 -; SSE41-NEXT: movdqa 16(%rdi), %xmm5 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm8 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm5 +; SSE41-NEXT: movdqa 48(%rdi), %xmm9 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: 
pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand 
%xmm4, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: 
pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 ; SSE41-NEXT: packusdw %xmm6, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} 
xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: packusdw %xmm6, %xmm5 -; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2188,28 +2248,31 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; ; SSE41-LABEL: trunc_packus_v2i64_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 
+; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -2344,28 +2407,31 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, 
%xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $0, %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: trunc_packus_v2i64_v2i8_store: @@ -2490,53 +2556,59 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; ; SSE41-LABEL: trunc_packus_v4i64_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa 
%xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [255,255] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; 
SSE41-NEXT: pand %xmm3, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v4i64_v4i8: @@ -2685,52 +2757,58 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-LABEL: trunc_packus_v4i64_v4i8_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd 
%xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [255,255] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; 
SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, (%rdi) +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm3 +; SSE41-NEXT: movd %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v4i64_v4i8_store: @@ -2936,94 +3014,109 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; SSE41-LABEL: trunc_packus_v8i64_v8i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm7 -; SSE41-NEXT: movdqa 16(%rdi), %xmm5 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm8 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm5 +; SSE41-NEXT: movdqa 48(%rdi), %xmm9 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = 
[2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm11 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 ; SSE41-NEXT: packusdw %xmm6, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; 
SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: packusdw %xmm6, %xmm5 -; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -3223,96 +3316,111 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm7 -; SSE41-NEXT: movdqa 16(%rdi), %xmm5 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; 
SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm1, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: xorpd %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm8, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: packusdw %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: xorpd %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm4 +; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [255,255] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; 
SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: xorpd %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 ; SSE41-NEXT: packusdw %xmm6, %xmm5 -; SSE41-NEXT: packusdw %xmm5, %xmm4 -; SSE41-NEXT: packuswb %xmm4, %xmm4 -; SSE41-NEXT: movq %xmm4, (%rsi) +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; 
SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: packusdw %xmm6, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm5 +; SSE41-NEXT: packuswb %xmm5, %xmm5 +; SSE41-NEXT: movq %xmm5, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: @@ -3615,183 +3723,214 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; ; SSE41-LABEL: trunc_packus_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm12 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 -; SSE41-NEXT: movdqa 80(%rdi), %xmm10 -; SSE41-NEXT: movdqa 64(%rdi), %xmm6 -; SSE41-NEXT: movdqa 112(%rdi), %xmm5 -; SSE41-NEXT: movdqa 96(%rdi), %xmm4 +; SSE41-NEXT: movdqa (%rdi), %xmm9 +; SSE41-NEXT: movdqa 16(%rdi), %xmm8 +; SSE41-NEXT: movdqa 32(%rdi), %xmm13 +; SSE41-NEXT: movdqa 48(%rdi), %xmm12 +; SSE41-NEXT: movdqa 80(%rdi), %xmm11 +; SSE41-NEXT: movdqa 64(%rdi), %xmm7 +; SSE41-NEXT: movdqa 112(%rdi), %xmm6 +; SSE41-NEXT: movdqa 96(%rdi), %xmm5 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: 
movdqa %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm10, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm10, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm6 -; SSE41-NEXT: pxor %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 
+; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm10, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm10 -; SSE41-NEXT: pxor %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm12 -; SSE41-NEXT: pxor %xmm2, %xmm12 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm11 -; SSE41-NEXT: pxor %xmm2, %xmm11 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: movdqa %xmm10, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, 
%xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm7 +; SSE41-NEXT: movdqa %xmm13, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm11 +; SSE41-NEXT: movdqa %xmm10, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm7, %xmm7 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: xorpd %xmm2, %xmm8 -; SSE41-NEXT: movapd %xmm8, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm11 +; SSE41-NEXT: movdqa %xmm12, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm13 +; SSE41-NEXT: movdqa %xmm10, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm13 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm12 +; SSE41-NEXT: movdqa %xmm10, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 +; 
SSE41-NEXT: movapd %xmm1, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm12 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: pxor %xmm8, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: movapd %xmm11, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm12, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm1 ; SSE41-NEXT: packusdw %xmm8, %xmm1 -; SSE41-NEXT: movapd %xmm12, 
%xmm8 -; SSE41-NEXT: xorpd %xmm2, %xmm8 -; SSE41-NEXT: movapd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm13, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 -; SSE41-NEXT: movapd %xmm10, %xmm9 -; SSE41-NEXT: xorpd %xmm2, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm8 +; SSE41-NEXT: movapd %xmm11, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9 ; SSE41-NEXT: packusdw %xmm8, %xmm9 ; SSE41-NEXT: packusdw %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm8 -; SSE41-NEXT: xorpd %xmm2, %xmm8 -; SSE41-NEXT: movapd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd 
%xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: packusdw %xmm8, %xmm7 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: packusdw %xmm8, %xmm6 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: xorpd %xmm2, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 -; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: packusdw %xmm5, %xmm7 -; SSE41-NEXT: packusdw %xmm7, %xmm6 -; SSE41-NEXT: packuswb %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm7 +; SSE41-NEXT: packuswb %xmm7, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index d0cdbf1e3f08d..5bfe2c1702880 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -58,23 +58,26 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [2147483647,2147483647] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = 
[4294967295,0,4294967295,0] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4294967295,0,4294967295] -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq @@ -181,23 +184,26 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [2147483647,2147483647] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, 
%xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4294967295,0,4294967295] -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: movq %xmm0, (%rdi) @@ -334,45 +340,51 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm7 = [4294967295,0,4294967295,0] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, 
%xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [0,4294967295,0,4294967295] -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm7 = [0,4294967295,0,4294967295] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, 
%xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; SSE41-NEXT: movaps %xmm1, %xmm0 @@ -604,88 +616,101 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [2147483647,2147483647] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} 
xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: xorpd %xmm3, %xmm7 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [0,4294967295,0,4294967295] -; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; 
SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm7 = [0,4294967295,0,4294967295] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: xorpd %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 ; SSE41-NEXT: xorpd %xmm4, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2] ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -805,23 +830,26 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE41-NEXT: pxor %xmm2, %xmm3 
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -939,23 +967,26 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd 
%xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -1106,45 +1137,51 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: 
pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: packssdw %xmm2, %xmm1 ; SSE41-NEXT: packssdw %xmm1, %xmm1 @@ -1290,45 +1327,51 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: 
movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: packssdw %xmm2, %xmm1 ; SSE41-NEXT: packssdw %xmm1, %xmm1 @@ -1530,94 +1573,107 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; 
SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm11 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709518848,18446744073709518848] -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: xorpd %xmm2, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; 
SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packssdw %xmm7, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: xorpd %xmm2, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: packssdw %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: packssdw %xmm7, %xmm5 +; SSE41-NEXT: packssdw %xmm8, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1955,23 +2011,26 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: movdqa 
%xmm1, %xmm0 @@ -2101,23 +2160,26 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) @@ -2242,45 +2304,51 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = 
[2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: packssdw %xmm2, %xmm1 ; SSE41-NEXT: packssdw %xmm1, %xmm1 @@ -2430,45 +2498,51 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = 
xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, 
%xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: packssdw %xmm2, %xmm1 ; SSE41-NEXT: packssdw %xmm1, %xmm1 @@ -2674,94 +2748,107 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd 
%xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por 
%xmm9, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: xorpd %xmm2, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packssdw %xmm7, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: xorpd %xmm2, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm8 +; 
SSE41-NEXT: packssdw %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: packssdw %xmm7, %xmm5 +; SSE41-NEXT: packssdw %xmm8, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm1 ; SSE41-NEXT: packsswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2967,94 +3054,107 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm3 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor 
%xmm1, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm1, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor 
%xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: xorpd %xmm1, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm9 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 
= xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: xorpd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: packssdw %xmm7, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm7 -; SSE41-NEXT: xorpd %xmm1, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: packssdw %xmm9, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 ; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: packssdw %xmm7, %xmm5 +; SSE41-NEXT: packssdw %xmm8, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm4 ; SSE41-NEXT: packsswb %xmm4, %xmm4 ; SSE41-NEXT: movq %xmm4, (%rsi) @@ -3370,177 +3470,202 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm8 ; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm12 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 +; SSE41-NEXT: movdqa 32(%rdi), %xmm13 +; SSE41-NEXT: movdqa 48(%rdi), %xmm12 ; SSE41-NEXT: movdqa 80(%rdi), %xmm10 ; SSE41-NEXT: movdqa 64(%rdi), %xmm6 ; SSE41-NEXT: movdqa 112(%rdi), %xmm5 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm11, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm11, %xmm3 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, 
%xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm5 +; SSE41-NEXT: movdqa %xmm9, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm6 -; SSE41-NEXT: pxor %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: 
pcmpeqd %xmm11, %xmm6 +; SSE41-NEXT: movdqa %xmm9, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm10 -; SSE41-NEXT: pxor %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm13, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm10 +; SSE41-NEXT: movdqa %xmm9, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm12 -; SSE41-NEXT: pxor %xmm2, %xmm12 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm10 +; SSE41-NEXT: movdqa %xmm12, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm13 +; SSE41-NEXT: movdqa %xmm9, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm13 +; SSE41-NEXT: movdqa 
%xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm9, %xmm14 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm15 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3] +; SSE41-NEXT: por %xmm15, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm11 -; SSE41-NEXT: pxor %xmm2, %xmm11 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm12 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm8 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm1, %xmm9 -; SSE41-NEXT: xorpd %xmm2, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm9, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: 
blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movapd %xmm11, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 -; SSE41-NEXT: packssdw %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm12, %xmm9 -; SSE41-NEXT: xorpd %xmm2, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm11 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm9 -; SSE41-NEXT: movapd %xmm10, %xmm11 -; SSE41-NEXT: xorpd %xmm2, %xmm11 -; SSE41-NEXT: movapd %xmm11, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm12 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 -; SSE41-NEXT: packssdw %xmm9, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE41-NEXT: movapd %xmm12, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: 
pand %xmm1, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm1 ; SSE41-NEXT: packssdw %xmm11, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm9 -; SSE41-NEXT: xorpd %xmm2, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm10 +; SSE41-NEXT: movapd %xmm13, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm11 +; SSE41-NEXT: movapd %xmm10, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm12 +; SSE41-NEXT: packssdw %xmm11, %xmm12 +; SSE41-NEXT: packssdw %xmm12, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; 
SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: packssdw %xmm9, %xmm6 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: xorpd %xmm2, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: packssdw %xmm10, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm0, 
%xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE41-NEXT: packssdw %xmm5, %xmm7 ; SSE41-NEXT: packssdw %xmm7, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 65916aaf52f9e..742ad88593ca1 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -40,10 +40,11 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -116,10 +117,11 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ 
-207,20 +209,21 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 @@ -401,51 +404,54 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v8i64_v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm4 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm3 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm7 ; SSE41-NEXT: 
movdqa 48(%rdi), %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] 
+; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2] -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i32: @@ -538,10 +544,11 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: 
movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -632,10 +639,11 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -748,26 +756,27 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm5, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -880,29 +889,30 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-LABEL: trunc_usat_v4i64_v4i16_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = 
[9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm6, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm4 -; SSE41-NEXT: movq %xmm4, (%rdi) +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm5, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: movq %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i16_store: @@ -1045,52 +1055,55 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = 
[9223372039002324991,9223372039002324991] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm9, 
%xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i16: @@ -1634,10 +1647,11 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -1729,10 +1743,11 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -1822,26 +1837,27 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm5, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -1956,30 +1972,31 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-LABEL: trunc_usat_v4i64_v4i8_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = 
[2147483903,2147483903,2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm6, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm4 -; SSE41-NEXT: packuswb %xmm4, %xmm4 -; SSE41-NEXT: movd %xmm4, (%rdi) +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm5, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm3 +; SSE41-NEXT: movd %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i8_store: @@ -2121,53 +2138,56 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 -; 
SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: 
movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8: @@ -2303,53 +2323,56 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm6 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm7, %xmm9 -; SSE41-NEXT: pxor %xmm5, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: 
pcmpeqd %xmm4, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: packusdw %xmm9, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pxor %xmm5, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: movapd %xmm1, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm8, %xmm2 -; 
SSE41-NEXT: packusdw %xmm2, %xmm7 -; SSE41-NEXT: packuswb %xmm7, %xmm7 -; SSE41-NEXT: movq %xmm7, (%rsi) +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; SSE41-NEXT: packusdw %xmm8, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm6 +; SSE41-NEXT: packuswb %xmm6, %xmm6 +; SSE41-NEXT: movq %xmm6, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: @@ -2538,96 +2561,103 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa 96(%rdi), %xmm3 -; SSE41-NEXT: movdqa 112(%rdi), %xmm5 -; SSE41-NEXT: movdqa 64(%rdi), %xmm8 -; SSE41-NEXT: movdqa 80(%rdi), %xmm9 -; SSE41-NEXT: movdqa (%rdi), %xmm12 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm10 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = 
[9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm13 -; SSE41-NEXT: pxor %xmm7, %xmm13 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm13 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] +; SSE41-NEXT: movdqa 96(%rdi), %xmm2 +; SSE41-NEXT: movdqa 112(%rdi), %xmm4 +; SSE41-NEXT: movdqa 64(%rdi), %xmm7 +; SSE41-NEXT: movdqa 80(%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm11 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm9 +; SSE41-NEXT: movdqa 48(%rdi), %xmm10 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm6, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 ; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm2 -; SSE41-NEXT: pxor %xmm7, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2 -; SSE41-NEXT: packusdw %xmm13, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm12 -; SSE41-NEXT: pxor %xmm7, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm12 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, 
%xmm1, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: packusdw %xmm12, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 ; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pxor %xmm7, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm11 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm11 +; SSE41-NEXT: movapd %xmm3, %xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 -; SSE41-NEXT: packusdw %xmm12, %xmm11 -; SSE41-NEXT: packusdw %xmm11, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm10 -; SSE41-NEXT: pxor %xmm7, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm10 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, 
%xmm9, %xmm10 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm7, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm9 +; SSE41-NEXT: packusdw %xmm11, %xmm10 +; SSE41-NEXT: packusdw %xmm10, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 -; SSE41-NEXT: packusdw %xmm10, %xmm9 -; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pxor %xmm7, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm8, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm9 -; SSE41-NEXT: packuswb %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: packusdw %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm4, 
%xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 ; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm8 +; SSE41-NEXT: packuswb %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v16i64_v16i8: diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index 54dc107fd0c10..c3e9a2b6841ae 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -3176,11 +3176,10 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> % ; ; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; X86-AVX2-NEXT: retl ; ; 
X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index be6ee8f689958..bfd25aa667d2d 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -636,9 +636,9 @@ define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i ; ; SSE41-LABEL: shrunkblend_nonvselectuse: ; SSE41: # %bb.0: -; SSE41-NEXT: psllq $63, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -770,24 +770,14 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) { ; SSE-NEXT: shll $15, %eax ; SSE-NEXT: retq ; -; AVX1-LABEL: vselect_any_extend_vector_inreg_crash: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: shll $15, %eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: vselect_any_extend_vector_inreg_crash: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [49,49,49,49] -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: shll $15, %eax -; AVX2-NEXT: retq +; AVX-LABEL: vselect_any_extend_vector_inreg_crash: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: shll $15, %eax +; AVX-NEXT: retq 0: %1 = load <8 x i8>, ptr %x %2 = icmp eq <8 x i8> %1, From 4e7858ec9ec14a1dd6fc826103f654b67d4a47f7 Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Wed, 14 May 2025 16:22:52 +0200 Subject: [PATCH 2/3] Adding 
DoNotPoisonEltMask to SimplifyDemandedVectorElts --- llvm/include/llvm/CodeGen/TargetLowering.h | 9 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 185 ++-- llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll | 7 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 602 ++++++------ llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 90 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 896 ++++++++---------- .../AMDGPU/load-range-metadata-sign-bits.ll | 5 +- llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 13 +- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 60 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 80 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 16 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 80 +- llvm/test/CodeGen/ARM/fpclamptosat_vec.ll | 444 ++++----- .../CodeGen/Thumb2/mve-fpclamptosat_vec.ll | 80 +- .../Thumb2/mve-gather-ind8-unscaled.ll | 5 + .../CodeGen/Thumb2/mve-laneinterleaving.ll | 86 +- llvm/test/CodeGen/Thumb2/mve-pred-ext.ll | 1 - llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 213 ++--- .../Thumb2/mve-scatter-ind8-unscaled.ll | 9 +- .../CodeGen/Thumb2/mve-vecreduce-addpred.ll | 8 +- .../CodeGen/Thumb2/mve-vecreduce-mlapred.ll | 8 +- .../CodeGen/X86/avx512-intrinsics-upgrade.ll | 46 +- .../X86/avx512vl-intrinsics-upgrade.ll | 52 +- ...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 48 +- llvm/test/CodeGen/X86/known-signbits-shl.ll | 3 +- .../test/CodeGen/X86/known-signbits-vector.ll | 30 +- ...of-two-or-zero-when-comparing-with-zero.ll | 2 +- llvm/test/CodeGen/X86/pr42727.ll | 2 +- .../test/CodeGen/X86/rotate-extract-vector.ll | 2 - llvm/test/CodeGen/X86/shrink_vmul.ll | 32 +- .../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 20 +- .../CodeGen/X86/urem-seteq-illegal-types.ll | 8 +- .../CodeGen/X86/urem-seteq-vec-nonsplat.ll | 10 +- llvm/test/CodeGen/X86/vec_smulo.ll | 70 +- llvm/test/CodeGen/X86/vec_umulo.ll | 130 ++- llvm/test/CodeGen/X86/vector-mul.ll | 64 +- ...vector_splat-const-shift-of-constmasked.ll | 9 +- 38 files changed, 1618 
insertions(+), 1811 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 03099e9ad44dc..719cb472b785c 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4192,6 +4192,15 @@ class TargetLowering : public TargetLoweringBase { /// results of this function, because simply replacing TLO.Old /// with TLO.New will be incorrect when this parameter is true and TLO.Old /// has multiple uses. + /// Vector elements that aren't demanded can be turned into poison unless the + /// corresponding bit in the \p DoNotPoisonEltMask is set. + bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, + const APInt &DoNotPoisonEltMask, + APInt &KnownUndef, APInt &KnownZero, + TargetLoweringOpt &TLO, unsigned Depth = 0, + bool AssumeSingleUse = false) const; + /// Version of SimplifyDemandedVectorElts without the DoNotPoisonEltMask + /// argument. All undemanded elements can be turned into poison. 
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0, diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 09c6218b3dfd9..db51953207ad0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1414,7 +1414,9 @@ bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, bool AssumeSingleUse) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); APInt KnownUndef, KnownZero; - if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, + APInt DoNotPoisonElts = APInt::getZero(DemandedElts.getBitWidth()); + if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DoNotPoisonElts, + KnownUndef, KnownZero, TLO, 0, AssumeSingleUse)) return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index d5697b6031537..0dbe7eba4875b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2754,25 +2754,34 @@ bool TargetLowering::SimplifyDemandedBits( unsigned Scale = BitWidth / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); + APInt DemandedSrcElts = APInt::getZero(NumSrcElts); for (unsigned i = 0; i != Scale; ++i) { unsigned EltOffset = IsLE ? i : (Scale - 1 - i); unsigned BitOffset = EltOffset * NumSrcEltBits; APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset); - if (!Sub.isZero()) + if (!Sub.isZero()) { DemandedSrcBits |= Sub; + for (unsigned j = 0; j != NumElts; ++j) + if (DemandedElts[j]) + DemandedSrcElts.setBit((j * Scale) + i); + } } - // Need to demand all smaller source elements that maps to a demanded - // destination element, since recursive calls below may turn not demanded - // elements into poison. 
- APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); + // Need to "semi demand" all smaller source elements that map to a + // demanded destination element, since recursive calls below may turn not + // demanded elements into poison. Instead of demanding such elements we + // use a special bitmask to indicate that the recursive calls must not + // turn such elements into poison. + APInt NoPoisonSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); APInt KnownSrcUndef, KnownSrcZero; - if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef, + if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, + NoPoisonSrcElts, KnownSrcUndef, KnownSrcZero, TLO, Depth + 1)) return true; KnownBits KnownSrcBits; - if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, + if (SimplifyDemandedBits(Src, DemandedSrcBits, + DemandedSrcElts | NoPoisonSrcElts, KnownSrcBits, TLO, Depth + 1)) return true; } else if (IsLE && (NumSrcEltBits % BitWidth) == 0) { @@ -2790,7 +2799,9 @@ bool TargetLowering::SimplifyDemandedBits( if (SrcVT.isVector()) { APInt KnownSrcUndef, KnownSrcZero; - if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef, + if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, + APInt::getZero(NumSrcElts), + KnownSrcUndef, KnownSrcZero, TLO, Depth + 1)) return true; } @@ -3010,8 +3021,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op, !DCI.isBeforeLegalizeOps()); APInt KnownUndef, KnownZero; + APInt DoNotPoisonEltMask = APInt::getZero(DemandedElts.getBitWidth()); bool Simplified = - SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO); + SimplifyDemandedVectorElts(Op, DemandedElts, DoNotPoisonEltMask, + KnownUndef, KnownZero, TLO); if (Simplified) { DCI.AddToWorklist(Op.getNode()); DCI.CommitTargetLoweringOpt(TLO); @@ -3069,7 +3082,19 @@ static APInt getKnownUndefForVectorBinop(SDValue BO, SelectionDAG &DAG, } bool TargetLowering::SimplifyDemandedVectorElts( - SDValue Op, const APInt 
&OriginalDemandedElts, APInt &KnownUndef, + SDValue Op, const APInt &OriginalDemandedElts, + APInt &KnownUndef, + APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth, + bool AssumeSingleUse) const { + APInt DoNotPoisonEltMask = APInt::getZero(OriginalDemandedElts.getBitWidth()); + return SimplifyDemandedVectorElts(Op, OriginalDemandedElts, DoNotPoisonEltMask, + KnownUndef, KnownZero, TLO, Depth, AssumeSingleUse); +} + +bool TargetLowering::SimplifyDemandedVectorElts( + SDValue Op, const APInt &OriginalDemandedElts, + const APInt &DoNotPoisonEltMask, + APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth, bool AssumeSingleUse) const { EVT VT = Op.getValueType(); @@ -3110,6 +3135,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( if (Depth >= SelectionDAG::MaxRecursionDepth) return false; + APInt DemandedEltsInclDoNotPoison = DemandedElts | DoNotPoisonEltMask; SDLoc DL(Op); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool IsLE = TLO.DAG.getDataLayout().isLittleEndian(); @@ -3117,9 +3143,9 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Helper for demanding the specified elements and all the bits of both binary // operands. 
auto SimplifyDemandedVectorEltsBinOp = [&](SDValue Op0, SDValue Op1) { - SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(Op0, DemandedElts, + SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(Op0, DemandedEltsInclDoNotPoison, TLO.DAG, Depth + 1); - SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts, + SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedEltsInclDoNotPoison, TLO.DAG, Depth + 1); if (NewOp0 || NewOp1) { SDValue NewOp = @@ -3150,9 +3176,13 @@ bool TargetLowering::SimplifyDemandedVectorElts( unsigned NumSrcElts = SrcEltCnt.getFixedValue(); if (isNullConstant(Idx)) { APInt SrcDemandedElts = APInt::getOneBitSet(NumSrcElts, 0); + APInt SrcDoNotPoisonEltMask = APInt::getZero(NumSrcElts); + APInt SrcUndef = KnownUndef.zextOrTrunc(NumSrcElts); APInt SrcZero = KnownZero.zextOrTrunc(NumSrcElts); - if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero, + if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, + SrcDoNotPoisonEltMask, + SrcUndef, SrcZero, TLO, Depth + 1)) return true; } @@ -3185,17 +3215,19 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Fast handling of 'identity' bitcasts. unsigned NumSrcElts = SrcVT.getVectorNumElements(); if (NumSrcElts == NumElts) - return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, + return SimplifyDemandedVectorElts(Src, DemandedElts, DoNotPoisonEltMask, + KnownUndef, KnownZero, TLO, Depth + 1); - APInt SrcDemandedElts, SrcZero, SrcUndef; + APInt SrcDemandedElts, SrcDoNotPoisonEltMask, SrcZero, SrcUndef; // Bitcast from 'large element' src vector to 'small element' vector, we // must demand a source element if any DemandedElt maps to it. 
if ((NumElts % NumSrcElts) == 0) { unsigned Scale = NumElts / NumSrcElts; SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); - if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero, + SrcDoNotPoisonEltMask = APIntOps::ScaleBitMask(DoNotPoisonEltMask, NumSrcElts); + if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcDoNotPoisonEltMask, SrcUndef, SrcZero, TLO, Depth + 1)) return true; @@ -3206,7 +3238,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits(); APInt SrcDemandedBits = APInt::getZero(SrcEltSizeInBits); for (unsigned i = 0; i != NumElts; ++i) - if (DemandedElts[i]) { + if (DemandedEltsInclDoNotPoison[i]) { unsigned Ofs = (i % Scale) * EltSizeInBits; SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits); } @@ -3250,7 +3282,9 @@ bool TargetLowering::SimplifyDemandedVectorElts( if ((NumSrcElts % NumElts) == 0) { unsigned Scale = NumSrcElts / NumElts; SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); - if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero, + SrcDoNotPoisonEltMask = APIntOps::ScaleBitMask(DoNotPoisonEltMask, NumSrcElts); + if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, + SrcDoNotPoisonEltMask, SrcUndef, SrcZero, TLO, Depth + 1)) return true; @@ -3269,7 +3303,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( } case ISD::FREEZE: { SDValue N0 = Op.getOperand(0); - if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts, + if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedEltsInclDoNotPoison, /*PoisonOnly=*/false)) return TLO.CombineTo(Op, N0); @@ -3318,8 +3352,10 @@ bool TargetLowering::SimplifyDemandedVectorElts( for (unsigned i = 0; i != NumSubVecs; ++i) { SDValue SubOp = Op.getOperand(i); APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts); + APInt DoNotPoisonSubElts = DoNotPoisonEltMask.extractBits(NumSubElts, i * NumSubElts); APInt SubUndef, SubZero; - if 
(SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO, + if (SimplifyDemandedVectorElts(SubOp, SubElts, DoNotPoisonSubElts, + SubUndef, SubZero, TLO, Depth + 1)) return true; KnownUndef.insertBits(SubUndef, i * NumSubElts); @@ -3327,12 +3363,12 @@ bool TargetLowering::SimplifyDemandedVectorElts( } // Attempt to avoid multi-use ops if we don't need anything from them. - if (!DemandedElts.isAllOnes()) { + if (!DemandedEltsInclDoNotPoison.isAllOnes()) { bool FoundNewSub = false; SmallVector DemandedSubOps; for (unsigned i = 0; i != NumSubVecs; ++i) { SDValue SubOp = Op.getOperand(i); - APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts); + APInt SubElts = DemandedEltsInclDoNotPoison.extractBits(NumSubElts, i * NumSubElts); SDValue NewSubOp = SimplifyMultipleUseDemandedVectorElts( SubOp, SubElts, TLO.DAG, Depth + 1); DemandedSubOps.push_back(NewSubOp ? NewSubOp : SubOp); @@ -3354,11 +3390,15 @@ bool TargetLowering::SimplifyDemandedVectorElts( uint64_t Idx = Op.getConstantOperandVal(2); unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx); + APInt DoNoPoisonSubElts = DoNotPoisonEltMask.extractBits(NumSubElts, Idx); APInt DemandedSrcElts = DemandedElts; DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx); + APInt DoNoPoisonSrcElts = DoNotPoisonEltMask; + DoNoPoisonSrcElts.insertBits(APInt::getZero(NumSubElts), Idx); APInt SubUndef, SubZero; - if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO, + if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, DoNoPoisonSubElts, + SubUndef, SubZero, TLO, Depth + 1)) return true; @@ -3368,7 +3408,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( TLO.DAG.getUNDEF(VT), Sub, Op.getOperand(2))); - if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownUndef, KnownZero, + if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, DoNoPoisonSrcElts, + KnownUndef, KnownZero, TLO, Depth + 1)) return 
true; KnownUndef.insertBits(SubUndef, Idx); @@ -3377,9 +3418,9 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Attempt to avoid multi-use ops if we don't need anything from them. if (!DemandedSrcElts.isAllOnes() || !DemandedSubElts.isAllOnes()) { SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( - Src, DemandedSrcElts, TLO.DAG, Depth + 1); + Src, DemandedSrcElts | DoNoPoisonSrcElts, TLO.DAG, Depth + 1); SDValue NewSub = SimplifyMultipleUseDemandedVectorElts( - Sub, DemandedSubElts, TLO.DAG, Depth + 1); + Sub, DemandedSubElts | DoNoPoisonSubElts, TLO.DAG, Depth + 1); if (NewSrc || NewSub) { NewSrc = NewSrc ? NewSrc : Src; NewSub = NewSub ? NewSub : Sub; @@ -3398,9 +3439,11 @@ bool TargetLowering::SimplifyDemandedVectorElts( uint64_t Idx = Op.getConstantOperandVal(1); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); + APInt DoNotPoisonDemandedSrcElts = DoNotPoisonEltMask.zext(NumSrcElts).shl(Idx); + APInt DemandedSrcEltsInclDoNotPoison = DemandedEltsInclDoNotPoison.zext(NumSrcElts).shl(Idx); APInt SrcUndef, SrcZero; - if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO, + if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, DoNotPoisonDemandedSrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; KnownUndef = SrcUndef.extractBits(NumElts, Idx); @@ -3409,7 +3452,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Attempt to avoid multi-use ops if we don't need anything from them. if (!DemandedElts.isAllOnes()) { SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( - Src, DemandedSrcElts, TLO.DAG, Depth + 1); + Src, DemandedSrcEltsInclDoNotPoison, TLO.DAG, Depth + 1); if (NewSrc) { SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, NewSrc, Op.getOperand(1)); @@ -3427,13 +3470,20 @@ bool TargetLowering::SimplifyDemandedVectorElts( // then strip it, else remove it from the demanded elts. 
if (CIdx && CIdx->getAPIntValue().ult(NumElts)) { unsigned Idx = CIdx->getZExtValue(); - if (!DemandedElts[Idx]) + // FIXME: We could perhaps optimize this better. If the element isn't + // demanded but must not be poisoned, then we could insert UNDEF instead + // of the original value. Or if we know that the source element is + // guaranteed not to be poison then we can drop the INSERT_VECTOR_ELT + // anyway. + if (!DemandedEltsInclDoNotPoison[Idx]) return TLO.CombineTo(Op, Vec); APInt DemandedVecElts(DemandedElts); DemandedVecElts.clearBit(Idx); - if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef, - KnownZero, TLO, Depth + 1)) + APInt SrcDoNotPoisonEltMask(DoNotPoisonEltMask); + SrcDoNotPoisonEltMask.clearBit(Idx); + if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, SrcDoNotPoisonEltMask, + KnownUndef, KnownZero, TLO, Depth + 1)) return true; KnownUndef.setBitVal(Idx, Scl.isUndef()); @@ -3443,7 +3493,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( } APInt VecUndef, VecZero; - if (SimplifyDemandedVectorElts(Vec, DemandedElts, VecUndef, VecZero, TLO, + if (SimplifyDemandedVectorElts(Vec, DemandedElts, DoNotPoisonEltMask, + VecUndef, VecZero, TLO, Depth + 1)) return true; // Without knowing the insertion index we can't set KnownUndef/KnownZero. @@ -3457,7 +3508,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Try to transform the select condition based on the current demanded // elements. 
APInt UndefSel, ZeroSel; - if (SimplifyDemandedVectorElts(Sel, DemandedElts, UndefSel, ZeroSel, TLO, + if (SimplifyDemandedVectorElts(Sel, DemandedElts, DoNotPoisonEltMask, + UndefSel, ZeroSel, TLO, Depth + 1)) return true; @@ -3466,10 +3518,12 @@ bool TargetLowering::SimplifyDemandedVectorElts( APInt DemandedRHS(DemandedElts); APInt UndefLHS, ZeroLHS; APInt UndefRHS, ZeroRHS; - if (SimplifyDemandedVectorElts(LHS, DemandedLHS, UndefLHS, ZeroLHS, TLO, + if (SimplifyDemandedVectorElts(LHS, DemandedLHS, DoNotPoisonEltMask, + UndefLHS, ZeroLHS, TLO, Depth + 1)) return true; - if (SimplifyDemandedVectorElts(RHS, DemandedRHS, UndefRHS, ZeroRHS, TLO, + if (SimplifyDemandedVectorElts(RHS, DemandedRHS, DoNotPoisonEltMask, + UndefRHS, ZeroRHS, TLO, Depth + 1)) return true; @@ -3480,8 +3534,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( // select value element. APInt DemandedSel = DemandedElts & ~KnownZero; if (DemandedSel != DemandedElts) - if (SimplifyDemandedVectorElts(Sel, DemandedSel, UndefSel, ZeroSel, TLO, - Depth + 1)) + if (SimplifyDemandedVectorElts(Sel, DemandedSel, DoNotPoisonEltMask, + UndefSel, ZeroSel, TLO, Depth + 1)) return true; break; @@ -3491,14 +3545,23 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDValue RHS = Op.getOperand(1); ArrayRef ShuffleMask = cast(Op)->getMask(); - // Collect demanded elements from shuffle operands.. + // Collect demanded elements from shuffle operands. 
APInt DemandedLHS(NumElts, 0); APInt DemandedRHS(NumElts, 0); + APInt DoNotPoisonLHS(NumElts, 0); + APInt DoNotPoisonRHS(NumElts, 0); for (unsigned i = 0; i != NumElts; ++i) { int M = ShuffleMask[i]; - if (M < 0 || !DemandedElts[i]) + if (M < 0 || !DemandedEltsInclDoNotPoison[i]) continue; assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range"); + if (!DemandedElts[i]) { + if (M < (int)NumElts) + DoNotPoisonLHS.setBit(M); + else + DoNotPoisonRHS.setBit(M - NumElts); + continue; + } if (M < (int)NumElts) DemandedLHS.setBit(M); else @@ -3508,10 +3571,10 @@ bool TargetLowering::SimplifyDemandedVectorElts( // See if we can simplify either shuffle operand. APInt UndefLHS, ZeroLHS; APInt UndefRHS, ZeroRHS; - if (SimplifyDemandedVectorElts(LHS, DemandedLHS, UndefLHS, ZeroLHS, TLO, + if (SimplifyDemandedVectorElts(LHS, DemandedLHS, DoNotPoisonLHS, UndefLHS, ZeroLHS, TLO, Depth + 1)) return true; - if (SimplifyDemandedVectorElts(RHS, DemandedRHS, UndefRHS, ZeroRHS, TLO, + if (SimplifyDemandedVectorElts(RHS, DemandedRHS, DoNotPoisonRHS, UndefRHS, ZeroRHS, TLO, Depth + 1)) return true; @@ -3531,7 +3594,6 @@ bool TargetLowering::SimplifyDemandedVectorElts( IdentityLHS &= (M < 0) || (M == (int)i); IdentityRHS &= (M < 0) || ((M - NumElts) == i); } - // Update legal shuffle masks based on demanded elements if it won't reduce // to Identity which can cause premature removal of the shuffle mask. 
if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps) { @@ -3566,8 +3628,10 @@ bool TargetLowering::SimplifyDemandedVectorElts( APInt SrcUndef, SrcZero; SDValue Src = Op.getOperand(0); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); - APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts); - if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO, + APInt DemandedSrcElts = DemandedEltsInclDoNotPoison.zext(NumSrcElts); + APInt SrcDoNotPoisonEltMask = APInt::getZero(NumSrcElts); + if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcDoNotPoisonEltMask, + SrcUndef, SrcZero, TLO, Depth + 1)) return true; KnownZero = SrcZero.zextOrTrunc(NumElts); @@ -3615,7 +3679,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDValue Op1 = Op.getOperand(1); if (Op0 == Op1 && Op->isOnlyUserOf(Op0.getNode())) { APInt UndefLHS, ZeroLHS; - if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO, + if (SimplifyDemandedVectorElts(Op0, DemandedElts, DoNotPoisonEltMask, UndefLHS, ZeroLHS, TLO, Depth + 1, /*AssumeSingleUse*/ true)) return true; } @@ -3637,11 +3701,11 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDValue Op1 = Op.getOperand(1); APInt UndefRHS, ZeroRHS; - if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO, + if (SimplifyDemandedVectorElts(Op1, DemandedElts, DoNotPoisonEltMask, UndefRHS, ZeroRHS, TLO, Depth + 1)) return true; APInt UndefLHS, ZeroLHS; - if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO, + if (SimplifyDemandedVectorElts(Op0, DemandedElts, DoNotPoisonEltMask, UndefLHS, ZeroLHS, TLO, Depth + 1)) return true; @@ -3650,7 +3714,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Attempt to avoid multi-use ops if we don't need anything from them. // TODO - use KnownUndef to relax the demandedelts? 
- if (!DemandedElts.isAllOnes()) + if (!DemandedEltsInclDoNotPoison.isAllOnes()) if (SimplifyDemandedVectorEltsBinOp(Op0, Op1)) return true; break; @@ -3664,11 +3728,11 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDValue Op1 = Op.getOperand(1); APInt UndefRHS, ZeroRHS; - if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO, + if (SimplifyDemandedVectorElts(Op1, DemandedElts, DoNotPoisonEltMask, UndefRHS, ZeroRHS, TLO, Depth + 1)) return true; APInt UndefLHS, ZeroLHS; - if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO, + if (SimplifyDemandedVectorElts(Op0, DemandedElts, DoNotPoisonEltMask, UndefLHS, ZeroLHS, TLO, Depth + 1)) return true; @@ -3677,7 +3741,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Attempt to avoid multi-use ops if we don't need anything from them. // TODO - use KnownUndef to relax the demandedelts? - if (!DemandedElts.isAllOnes()) + if (!DemandedEltsInclDoNotPoison.isAllOnes()) if (SimplifyDemandedVectorEltsBinOp(Op0, Op1)) return true; break; @@ -3690,13 +3754,13 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDValue Op1 = Op.getOperand(1); APInt SrcUndef, SrcZero; - if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO, + if (SimplifyDemandedVectorElts(Op1, DemandedElts, DoNotPoisonEltMask, SrcUndef, SrcZero, TLO, Depth + 1)) return true; // If we know that a demanded element was zero in Op1 we don't need to // demand it in Op0 - its guaranteed to be zero. APInt DemandedElts0 = DemandedElts & ~SrcZero; - if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero, + if (SimplifyDemandedVectorElts(Op0, DemandedElts0, DoNotPoisonEltMask, KnownUndef, KnownZero, TLO, Depth + 1)) return true; @@ -3718,7 +3782,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownUndef &= ~KnownZero; // Attempt to avoid multi-use ops if we don't need anything from them. 
- if (!DemandedElts.isAllOnes()) + if (!DemandedEltsInclDoNotPoison.isAllOnes()) if (SimplifyDemandedVectorEltsBinOp(Op0, Op1)) return true; break; @@ -3726,13 +3790,13 @@ bool TargetLowering::SimplifyDemandedVectorElts( case ISD::TRUNCATE: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef, + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, DoNotPoisonEltMask, KnownUndef, KnownZero, TLO, Depth + 1)) return true; - if (!DemandedElts.isAllOnes()) + if (!DemandedEltsInclDoNotPoison.isAllOnes()) if (SDValue NewOp = SimplifyMultipleUseDemandedVectorElts( - Op.getOperand(0), DemandedElts, TLO.DAG, Depth + 1)) + Op.getOperand(0), DemandedEltsInclDoNotPoison, TLO.DAG, Depth + 1)) return TLO.CombineTo(Op, TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp)); if (Op.getOpcode() == ISD::ZERO_EXTEND) { @@ -3746,20 +3810,21 @@ bool TargetLowering::SimplifyDemandedVectorElts( case ISD::UINT_TO_FP: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef, + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, DoNotPoisonEltMask, KnownUndef, KnownZero, TLO, Depth + 1)) return true; // Don't fall through to generic undef -> undef handling. 
return false; default: { if (Op.getOpcode() >= ISD::BUILTIN_OP_END) { - if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef, + if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts | DoNotPoisonEltMask, KnownUndef, KnownZero, TLO, Depth)) return true; } else { KnownBits Known; APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); - if (SimplifyDemandedBits(Op, DemandedBits, OriginalDemandedElts, Known, + if (SimplifyDemandedBits(Op, DemandedBits, + OriginalDemandedElts | DoNotPoisonEltMask, Known, TLO, Depth, AssumeSingleUse)) return true; } diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll index c29039b86e82b..933c6506d0270 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll @@ -150,10 +150,9 @@ define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) { ; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0 -; CHECK-NEXT: v_mul_hi_u32 v6, v1, v0 -; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[4:5] -; CHECK-NEXT: flat_store_dword v[2:3], v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v0 +; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[3:4] +; CHECK-NEXT: flat_store_dword v[2:3], v4 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 6b7f648f65a45..a9240eff8e691 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -8340,216 +8340,191 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX6-NEXT: s_mov_b32 s7, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; 
GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s11, s7 -; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s17, s7 -; GFX6-NEXT: s_mov_b32 s19, s7 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s6, s5, 30 -; GFX6-NEXT: s_lshr_b32 s8, s5, 28 -; GFX6-NEXT: s_lshr_b32 s10, s5, 29 -; GFX6-NEXT: s_lshr_b32 s12, s5, 26 -; GFX6-NEXT: s_lshr_b32 s16, s5, 27 -; GFX6-NEXT: s_mov_b32 s18, s5 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[18:19], 0x10000 -; GFX6-NEXT: s_ashr_i32 s18, s5, 31 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[6:7], 0x10000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s31, s7 -; GFX6-NEXT: s_mov_b32 s35, s7 -; GFX6-NEXT: s_mov_b32 s25, s7 -; GFX6-NEXT: s_mov_b32 s27, s7 -; GFX6-NEXT: s_mov_b32 s21, s7 -; GFX6-NEXT: s_mov_b32 s23, s7 -; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s42, s5, 30 +; GFX6-NEXT: s_lshr_b32 s36, s5, 28 +; GFX6-NEXT: s_lshr_b32 s38, s5, 29 +; GFX6-NEXT: s_lshr_b32 s30, s5, 26 +; GFX6-NEXT: s_lshr_b32 s34, s5, 27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 24 +; GFX6-NEXT: s_lshr_b32 s28, s5, 25 +; GFX6-NEXT: s_lshr_b32 s22, s5, 22 +; GFX6-NEXT: s_lshr_b32 s24, s5, 23 +; GFX6-NEXT: s_lshr_b32 s18, s5, 20 +; GFX6-NEXT: s_lshr_b32 s20, s5, 21 +; GFX6-NEXT: s_lshr_b32 s14, s5, 18 +; GFX6-NEXT: s_lshr_b32 s16, s5, 19 +; GFX6-NEXT: s_lshr_b32 s10, s5, 16 +; GFX6-NEXT: s_lshr_b32 s12, s5, 17 +; GFX6-NEXT: s_lshr_b32 s6, s5, 14 +; GFX6-NEXT: s_lshr_b32 s8, s5, 15 +; GFX6-NEXT: s_mov_b32 s40, s5 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s7 +; GFX6-NEXT: s_lshr_b32 s40, s5, 12 ; GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: 
s_mov_b32 s45, s7 -; GFX6-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NEXT: v_mov_b32_e32 v7, s15 -; GFX6-NEXT: s_mov_b32 s47, s7 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s44 +; GFX6-NEXT: v_mov_b32_e32 v7, s45 +; GFX6-NEXT: s_lshr_b32 s44, s5, 13 ; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_mov_b32 s43, s7 -; GFX6-NEXT: v_mov_b32_e32 v8, s40 -; GFX6-NEXT: v_mov_b32_e32 v9, s41 -; GFX6-NEXT: s_mov_b32 s41, s7 +; GFX6-NEXT: s_lshr_b32 s42, s5, 10 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s5, 11 ; GFX6-NEXT: v_mov_b32_e32 v10, s38 ; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_mov_b32 s39, s7 -; GFX6-NEXT: v_mov_b32_e32 v12, s36 -; GFX6-NEXT: v_mov_b32_e32 v13, s37 -; GFX6-NEXT: s_mov_b32 s15, s7 -; GFX6-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NEXT: s_mov_b32 s37, s7 -; GFX6-NEXT: s_lshr_b32 s30, s5, 24 -; GFX6-NEXT: s_lshr_b32 s34, s5, 25 +; GFX6-NEXT: s_lshr_b32 s38, s5, 8 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[30:31], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v5, s18 +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s5, 9 +; GFX6-NEXT: v_mov_b32_e32 v14, s34 +; GFX6-NEXT: v_mov_b32_e32 v15, s35 +; GFX6-NEXT: s_lshr_b32 s34, s5, 6 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NEXT: v_mov_b32_e32 v3, s29 -; GFX6-NEXT: s_mov_b32 s29, s7 -; GFX6-NEXT: v_mov_b32_e32 
v4, s34 -; GFX6-NEXT: v_mov_b32_e32 v5, s35 -; GFX6-NEXT: s_lshr_b32 s24, s5, 22 -; GFX6-NEXT: s_lshr_b32 s26, s5, 23 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 7 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: s_lshr_b32 s28, s5, 4 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s24 -; GFX6-NEXT: v_mov_b32_e32 v9, s25 -; GFX6-NEXT: s_mov_b32 s25, s7 -; GFX6-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NEXT: s_mov_b32 s27, s7 -; GFX6-NEXT: s_lshr_b32 s20, s5, 20 -; GFX6-NEXT: s_lshr_b32 s22, s5, 21 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s22 +; GFX6-NEXT: v_mov_b32_e32 v9, s23 +; GFX6-NEXT: s_lshr_b32 s22, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v10, s24 +; GFX6-NEXT: v_mov_b32_e32 v11, s25 +; GFX6-NEXT: s_lshr_b32 s24, s5, 2 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s20 -; GFX6-NEXT: v_mov_b32_e32 v13, s21 -; GFX6-NEXT: s_mov_b32 s35, s7 -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_mov_b32 s21, s7 -; GFX6-NEXT: s_mov_b32 s23, s7 -; GFX6-NEXT: s_lshr_b32 s16, s5, 18 -; GFX6-NEXT: s_lshr_b32 s18, s5, 19 -; GFX6-NEXT: s_lshr_b32 s10, s5, 16 -; GFX6-NEXT: s_lshr_b32 s12, s5, 17 -; GFX6-NEXT: s_lshr_b32 s8, s5, 14 -; GFX6-NEXT: s_lshr_b32 s44, s5, 15 -; GFX6-NEXT: s_lshr_b32 s46, s5, 12 -; GFX6-NEXT: s_lshr_b32 s42, s5, 13 -; GFX6-NEXT: s_lshr_b32 s40, s5, 10 -; GFX6-NEXT: s_lshr_b32 s38, s5, 11 -; GFX6-NEXT: s_lshr_b32 s14, s5, 8 
-; GFX6-NEXT: s_lshr_b32 s36, s5, 9 -; GFX6-NEXT: s_lshr_b32 s28, s5, 6 -; GFX6-NEXT: s_lshr_b32 s30, s5, 7 -; GFX6-NEXT: s_lshr_b32 s24, s5, 4 -; GFX6-NEXT: s_lshr_b32 s26, s5, 5 -; GFX6-NEXT: s_lshr_b32 s34, s5, 2 -; GFX6-NEXT: s_lshr_b32 s20, s5, 3 -; GFX6-NEXT: s_lshr_b32 s22, s5, 1 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s18 +; GFX6-NEXT: v_mov_b32_e32 v13, s19 +; GFX6-NEXT: s_lshr_b32 s18, s5, 3 +; GFX6-NEXT: v_mov_b32_e32 v14, s20 +; GFX6-NEXT: v_mov_b32_e32 v15, s21 +; GFX6-NEXT: s_lshr_b32 s20, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 30 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NEXT: s_lshr_b32 s18, s4, 31 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 30 +; GFX6-NEXT: v_mov_b32_e32 v4, s16 +; GFX6-NEXT: v_mov_b32_e32 v5, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 31 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s10 ; GFX6-NEXT: v_mov_b32_e32 v9, s11 ; GFX6-NEXT: s_lshr_b32 s10, s4, 28 -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_mov_b32_e32 v11, s7 +; GFX6-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NEXT: v_mov_b32_e32 v11, s13 ; GFX6-NEXT: s_lshr_b32 s12, s4, 29 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 26 -; GFX6-NEXT: v_mov_b32_e32 v14, s6 -; GFX6-NEXT: v_mov_b32_e32 v15, s7 -; GFX6-NEXT: s_lshr_b32 s44, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[46:47], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NEXT: s_lshr_b32 s46, s4, 26 +; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 27 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 24 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NEXT: s_lshr_b32 s46, s4, 25 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x10000 +; GFX6-NEXT: s_lshr_b32 s44, s4, 25 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 22 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 22 ; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: s_lshr_b32 s40, s4, 23 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_lshr_b32 s42, s4, 23 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000 ; GFX6-NEXT: 
buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s14 -; GFX6-NEXT: v_mov_b32_e32 v13, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 20 +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 20 ; GFX6-NEXT: v_mov_b32_e32 v14, s6 ; GFX6-NEXT: v_mov_b32_e32 v15, s7 ; GFX6-NEXT: s_lshr_b32 s6, s4, 21 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 -; GFX6-NEXT: v_mov_b32_e32 v16, s28 -; GFX6-NEXT: v_mov_b32_e32 v17, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 18 -; GFX6-NEXT: v_mov_b32_e32 v18, s30 -; GFX6-NEXT: v_mov_b32_e32 v19, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 19 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v16, s34 +; GFX6-NEXT: v_mov_b32_e32 v17, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 18 +; GFX6-NEXT: v_mov_b32_e32 v18, s26 +; GFX6-NEXT: v_mov_b32_e32 v19, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 19 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s24 -; GFX6-NEXT: v_mov_b32_e32 v9, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 17 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s28 +; GFX6-NEXT: v_mov_b32_e32 v9, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 17 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 
0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s34 -; GFX6-NEXT: v_mov_b32_e32 v13, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 14 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s20 -; GFX6-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 12 +; GFX6-NEXT: v_mov_b32_e32 v12, s24 +; GFX6-NEXT: v_mov_b32_e32 v13, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 14 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s18 +; GFX6-NEXT: v_mov_b32_e32 v15, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 12 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s16 -; GFX6-NEXT: v_mov_b32_e32 v17, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v18, s18 -; GFX6-NEXT: v_mov_b32_e32 v19, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v18, s16 +; GFX6-NEXT: v_mov_b32_e32 v19, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 10 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288 @@ -8560,62 +8535,62 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v10, s12 ; GFX6-NEXT: v_mov_b32_e32 v11, s13 ; GFX6-NEXT: 
s_lshr_b32 s12, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v14, s36 -; GFX6-NEXT: v_mov_b32_e32 v15, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s38 +; GFX6-NEXT: v_mov_b32_e32 v13, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 6 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 7 ; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: v_mov_b32_e32 v3, s45 ; GFX6-NEXT: s_lshr_b32 s44, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s38 -; GFX6-NEXT: v_mov_b32_e32 v17, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v18, s40 -; GFX6-NEXT: v_mov_b32_e32 v19, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s36 +; GFX6-NEXT: v_mov_b32_e32 v17, s37 +; 
GFX6-NEXT: s_lshr_b32 s36, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v18, s42 +; GFX6-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 2 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s14 -; GFX6-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v8, s30 +; GFX6-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 3 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: 
s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 @@ -8624,58 +8599,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v11, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(2) +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s28 ; GFX6-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s24 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s22 -; GFX6-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mov_b32_e32 v1, s19 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 ; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: v_mov_b32_e32 v8, s4 ; GFX6-NEXT: v_mov_b32_e32 v9, s5 @@ -8686,16 +8661,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; GFX8-NEXT: 
s_mov_b32 s69, 0 -; GFX8-NEXT: s_mov_b32 s67, s69 -; GFX8-NEXT: s_mov_b32 s41, s69 -; GFX8-NEXT: s_mov_b32 s61, s69 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 -; GFX8-NEXT: s_mov_b32 s43, s69 -; GFX8-NEXT: s_mov_b32 s65, s69 -; GFX8-NEXT: s_mov_b32 s45, s69 -; GFX8-NEXT: s_mov_b32 s57, s69 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s0, s3, 8 ; GFX8-NEXT: s_lshr_b32 s48, s3, 15 @@ -8708,15 +8675,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s36, s3, 27 ; GFX8-NEXT: s_lshr_b32 s68, s3, 24 ; GFX8-NEXT: s_lshr_b32 s38, s3, 25 -; GFX8-NEXT: s_lshr_b32 s66, s3, 22 +; GFX8-NEXT: s_lshr_b32 s64, s3, 22 ; GFX8-NEXT: s_lshr_b32 s40, s3, 23 ; GFX8-NEXT: s_lshr_b32 s60, s3, 20 ; GFX8-NEXT: s_lshr_b32 s42, s3, 21 -; GFX8-NEXT: s_lshr_b32 s64, s3, 18 +; GFX8-NEXT: s_lshr_b32 s66, s3, 18 ; GFX8-NEXT: s_lshr_b32 s44, s3, 19 ; GFX8-NEXT: s_lshr_b32 s56, s3, 16 ; GFX8-NEXT: s_lshr_b32 s46, s3, 17 -; GFX8-NEXT: s_mov_b32 s47, s69 ; GFX8-NEXT: s_lshr_b32 s58, s3, 14 ; GFX8-NEXT: s_lshr_b32 s62, s3, 12 ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 @@ -8725,14 +8691,13 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s3, 11 ; GFX8-NEXT: v_writelane_b32 v62, s0, 2 -; GFX8-NEXT: s_mov_b32 s23, s69 ; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX8-NEXT: 
s_bfe_i64 s[56:57], s[56:57], 0x10000 @@ -8764,9 +8729,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v12, s72 ; GFX8-NEXT: v_mov_b32_e32 v0, s70 ; GFX8-NEXT: v_mov_b32_e32 v8, s68 -; GFX8-NEXT: v_mov_b32_e32 v16, s66 +; GFX8-NEXT: v_mov_b32_e32 v16, s64 ; GFX8-NEXT: v_mov_b32_e32 v20, s60 -; GFX8-NEXT: v_mov_b32_e32 v24, s64 +; GFX8-NEXT: v_mov_b32_e32 v24, s66 ; GFX8-NEXT: v_mov_b32_e32 v28, s56 ; GFX8-NEXT: v_mov_b32_e32 v32, s58 ; GFX8-NEXT: v_mov_b32_e32 v36, s62 @@ -8829,11 +8794,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NEXT: v_mov_b32_e32 v9, s69 ; GFX8-NEXT: v_mov_b32_e32 v11, s39 -; GFX8-NEXT: v_mov_b32_e32 v17, s67 +; GFX8-NEXT: v_mov_b32_e32 v17, s65 ; GFX8-NEXT: v_mov_b32_e32 v19, s41 ; GFX8-NEXT: v_mov_b32_e32 v21, s61 ; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_mov_b32_e32 v25, s65 +; GFX8-NEXT: v_mov_b32_e32 v25, s67 ; GFX8-NEXT: v_mov_b32_e32 v27, s45 ; GFX8-NEXT: v_mov_b32_e32 v29, s57 ; GFX8-NEXT: v_mov_b32_e32 v31, s47 @@ -9497,59 +9462,48 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s67, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s57, s67 -; GFX12-NEXT: s_mov_b32 s63, s67 -; GFX12-NEXT: s_mov_b32 s45, s67 -; GFX12-NEXT: s_mov_b32 s53, s67 -; GFX12-NEXT: s_mov_b32 s31, s67 -; GFX12-NEXT: s_mov_b32 s41, s67 -; GFX12-NEXT: s_mov_b32 s19, s67 -; GFX12-NEXT: s_mov_b32 s27, s67 -; GFX12-NEXT: s_mov_b32 s47, s67 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s96, s3, 30 -; GFX12-NEXT: s_lshr_b32 s98, s3, 31 -; GFX12-NEXT: s_lshr_b32 s92, s3, 28 -; 
GFX12-NEXT: s_lshr_b32 s94, s3, 29 -; GFX12-NEXT: s_lshr_b32 s78, s3, 26 -; GFX12-NEXT: s_lshr_b32 s88, s3, 27 +; GFX12-NEXT: s_lshr_b32 s96, s11, 30 +; GFX12-NEXT: s_lshr_b32 s98, s11, 31 +; GFX12-NEXT: s_lshr_b32 s92, s11, 28 +; GFX12-NEXT: s_lshr_b32 s94, s11, 29 +; GFX12-NEXT: s_lshr_b32 s78, s11, 26 +; GFX12-NEXT: s_lshr_b32 s88, s11, 27 ; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 -; GFX12-NEXT: s_lshr_b32 s66, s3, 24 -; GFX12-NEXT: s_lshr_b32 s74, s3, 25 +; GFX12-NEXT: s_lshr_b32 s66, s11, 24 +; GFX12-NEXT: s_lshr_b32 s74, s11, 25 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 -; GFX12-NEXT: s_lshr_b32 s56, s3, 22 -; GFX12-NEXT: s_lshr_b32 s62, s3, 23 +; GFX12-NEXT: s_lshr_b32 s56, s11, 22 +; GFX12-NEXT: s_lshr_b32 s62, s11, 23 ; GFX12-NEXT: v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100 ; GFX12-NEXT: v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92 ; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 -; GFX12-NEXT: s_lshr_b32 s44, s3, 20 -; GFX12-NEXT: s_lshr_b32 s52, s3, 21 -; GFX12-NEXT: s_lshr_b32 s30, s3, 18 -; GFX12-NEXT: s_lshr_b32 s40, s3, 19 -; GFX12-NEXT: s_lshr_b32 s18, s3, 16 -; GFX12-NEXT: s_lshr_b32 s26, s3, 17 -; GFX12-NEXT: s_lshr_b32 s4, s3, 14 -; GFX12-NEXT: s_lshr_b32 s6, s3, 15 +; GFX12-NEXT: s_lshr_b32 s44, s11, 20 +; GFX12-NEXT: s_lshr_b32 s52, s11, 21 +; GFX12-NEXT: s_lshr_b32 s30, s11, 18 +; GFX12-NEXT: s_lshr_b32 s40, s11, 19 +; GFX12-NEXT: s_lshr_b32 s18, s11, 16 +; GFX12-NEXT: s_lshr_b32 s26, s11, 17 +; GFX12-NEXT: s_lshr_b32 s2, s11, 14 +; GFX12-NEXT: s_lshr_b32 s4, s11, 15 ; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 ; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 
-; GFX12-NEXT: s_lshr_b32 s8, s3, 12 -; GFX12-NEXT: s_lshr_b32 s10, s3, 13 +; GFX12-NEXT: s_lshr_b32 s6, s11, 12 +; GFX12-NEXT: s_lshr_b32 s8, s11, 13 ; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 ; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX12-NEXT: s_lshr_b32 s12, s3, 10 -; GFX12-NEXT: s_lshr_b32 s14, s3, 11 +; GFX12-NEXT: s_lshr_b32 s12, s11, 10 +; GFX12-NEXT: s_lshr_b32 s14, s11, 11 ; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 ; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 @@ -9558,16 +9512,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX12-NEXT: s_lshr_b32 s16, s3, 8 -; GFX12-NEXT: s_lshr_b32 s20, s3, 9 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX12-NEXT: s_lshr_b32 s16, s11, 8 +; GFX12-NEXT: s_lshr_b32 s20, s11, 9 ; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 ; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX12-NEXT: s_lshr_b32 s22, s3, 6 -; GFX12-NEXT: s_lshr_b32 s24, s3, 7 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX12-NEXT: s_lshr_b32 s22, s11, 6 +; GFX12-NEXT: s_lshr_b32 s24, s11, 7 ; GFX12-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52 ; GFX12-NEXT: v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30 ; GFX12-NEXT: v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40 @@ -9585,39 +9539,39 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr 
addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:416 ; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:400 ; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:384 -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 -; GFX12-NEXT: v_mov_b32_e32 v5, s8 -; GFX12-NEXT: s_lshr_b32 s28, s3, 4 -; GFX12-NEXT: s_lshr_b32 s34, s3, 5 -; GFX12-NEXT: s_lshr_b32 s36, s3, 2 -; GFX12-NEXT: s_lshr_b32 s38, s3, 3 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX12-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-NEXT: s_lshr_b32 s28, s11, 4 +; GFX12-NEXT: s_lshr_b32 s34, s11, 5 +; GFX12-NEXT: s_lshr_b32 s36, s11, 2 +; GFX12-NEXT: s_lshr_b32 s38, s11, 3 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 -; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 -; GFX12-NEXT: s_lshr_b32 s42, s3, 1 -; GFX12-NEXT: s_mov_b32 s46, s3 +; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s12 +; GFX12-NEXT: s_lshr_b32 s42, s11, 1 +; GFX12-NEXT: s_mov_b32 s46, s11 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 ; GFX12-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 -; GFX12-NEXT: s_lshr_b32 s48, s2, 30 -; GFX12-NEXT: s_lshr_b32 s50, s2, 31 +; GFX12-NEXT: s_lshr_b32 s48, s10, 30 +; GFX12-NEXT: s_lshr_b32 s50, s10, 31 ; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20 ; 
GFX12-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22 -; GFX12-NEXT: s_lshr_b32 s54, s2, 28 -; GFX12-NEXT: s_lshr_b32 s58, s2, 29 +; GFX12-NEXT: s_lshr_b32 s54, s10, 28 +; GFX12-NEXT: s_lshr_b32 s58, s10, 29 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24 ; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28 -; GFX12-NEXT: s_lshr_b32 s60, s2, 26 -; GFX12-NEXT: s_lshr_b32 s64, s2, 27 +; GFX12-NEXT: s_lshr_b32 s60, s10, 26 +; GFX12-NEXT: s_lshr_b32 s64, s10, 27 ; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34 @@ -9632,43 +9586,43 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37 ; GFX12-NEXT: v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39 ; GFX12-NEXT: v_mov_b32_e32 v5, s46 -; GFX12-NEXT: s_lshr_b32 s68, s2, 24 -; GFX12-NEXT: s_lshr_b32 s70, s2, 25 -; GFX12-NEXT: s_lshr_b32 s72, s2, 22 -; GFX12-NEXT: s_lshr_b32 s76, s2, 23 +; GFX12-NEXT: s_lshr_b32 s68, s10, 24 +; GFX12-NEXT: s_lshr_b32 s70, s10, 25 +; GFX12-NEXT: s_lshr_b32 s72, s10, 22 +; GFX12-NEXT: s_lshr_b32 s76, s10, 23 ; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42 ; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48 -; GFX12-NEXT: s_lshr_b32 s80, s2, 20 -; GFX12-NEXT: s_lshr_b32 s82, s2, 21 +; GFX12-NEXT: s_lshr_b32 s80, s10, 20 +; GFX12-NEXT: s_lshr_b32 s82, s10, 21 ; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50 ; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54 -; GFX12-NEXT: s_lshr_b32 s84, 
s2, 18 -; GFX12-NEXT: s_lshr_b32 s86, s2, 19 +; GFX12-NEXT: s_lshr_b32 s84, s10, 18 +; GFX12-NEXT: s_lshr_b32 s86, s10, 19 ; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58 ; GFX12-NEXT: v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60 -; GFX12-NEXT: s_lshr_b32 s90, s2, 16 -; GFX12-NEXT: s_lshr_b32 s98, s2, 17 +; GFX12-NEXT: s_lshr_b32 s90, s10, 16 +; GFX12-NEXT: s_lshr_b32 s98, s10, 17 ; GFX12-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64 ; GFX12-NEXT: v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68 -; GFX12-NEXT: s_lshr_b32 s96, s2, 14 -; GFX12-NEXT: s_lshr_b32 s100, s2, 15 -; GFX12-NEXT: s_lshr_b32 s94, s2, 13 -; GFX12-NEXT: s_lshr_b32 s88, s2, 11 -; GFX12-NEXT: s_lshr_b32 s74, s2, 9 -; GFX12-NEXT: s_lshr_b32 s62, s2, 7 -; GFX12-NEXT: s_lshr_b32 s52, s2, 5 -; GFX12-NEXT: s_lshr_b32 s40, s2, 3 -; GFX12-NEXT: s_lshr_b32 s26, s2, 1 +; GFX12-NEXT: s_lshr_b32 s96, s10, 14 +; GFX12-NEXT: s_lshr_b32 s100, s10, 15 +; GFX12-NEXT: s_lshr_b32 s94, s10, 13 +; GFX12-NEXT: s_lshr_b32 s88, s10, 11 +; GFX12-NEXT: s_lshr_b32 s74, s10, 9 +; GFX12-NEXT: s_lshr_b32 s62, s10, 7 +; GFX12-NEXT: s_lshr_b32 s52, s10, 5 +; GFX12-NEXT: s_lshr_b32 s40, s10, 3 +; GFX12-NEXT: s_lshr_b32 s26, s10, 1 ; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70 @@ -9683,19 +9637,19 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73 ; GFX12-NEXT: v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77 ; GFX12-NEXT: v_mov_b32_e32 v5, s80 -; GFX12-NEXT: s_lshr_b32 
s92, s2, 12 -; GFX12-NEXT: s_lshr_b32 s78, s2, 10 +; GFX12-NEXT: s_lshr_b32 s92, s10, 12 +; GFX12-NEXT: s_lshr_b32 s78, s10, 10 ; GFX12-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82 ; GFX12-NEXT: v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84 -; GFX12-NEXT: s_lshr_b32 s66, s2, 8 -; GFX12-NEXT: s_lshr_b32 s56, s2, 6 -; GFX12-NEXT: s_lshr_b32 s44, s2, 4 -; GFX12-NEXT: s_lshr_b32 s30, s2, 2 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000 +; GFX12-NEXT: s_lshr_b32 s66, s10, 8 +; GFX12-NEXT: s_lshr_b32 s56, s10, 6 +; GFX12-NEXT: s_lshr_b32 s44, s10, 4 +; GFX12-NEXT: s_lshr_b32 s30, s10, 2 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 @@ -9739,8 +9693,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30 ; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26 ; GFX12-NEXT: v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18 -; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s2 -; GFX12-NEXT: v_mov_b32_e32 v24, s3 +; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s10 +; GFX12-NEXT: v_mov_b32_e32 v24, s11 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 606568d8b149a..817c5def5614f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -6209,7 +6209,6 @@ define 
amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 16 @@ -6234,7 +6233,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 s5, 0 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6573,9 +6571,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5 @@ -6621,8 +6617,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s3, 0 -; GCN-HSA-NEXT: s_mov_b32 s9, s3 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s2, s7 ; GCN-HSA-NEXT: s_mov_b32 s8, s5 @@ -7195,11 +7189,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 
s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s17, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s11 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s9 @@ -7277,12 +7267,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-HSA-NEXT: s_mov_b32 s7, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s11, s7 -; GCN-HSA-NEXT: s_mov_b32 s21, s7 -; GCN-HSA-NEXT: s_mov_b32 s23, s7 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s6, s19 ; GCN-HSA-NEXT: s_mov_b32 s10, s17 @@ -8341,21 +8327,20 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s69, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s69 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s68, s15 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s11 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s8, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[20:21], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[70:71], s[18:19], 0x100000 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s6, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, 
s4, 16 @@ -8365,7 +8350,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[14:15], 0x100000 @@ -8376,22 +8361,15 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[68:69], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s51, s69 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s53, s69 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s57, s69 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s55, s69 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s43, s69 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s45, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s71 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s71 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s69 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -8427,7 +8405,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], 
s[36:37], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 @@ -8451,16 +8429,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -8494,16 +8472,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-HSA-NEXT: s_mov_b32 s43, 0 -; GCN-HSA-NEXT: s_mov_b32 s49, s43 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s51, s43 -; GCN-HSA-NEXT: s_mov_b32 s53, s43 
-; GCN-HSA-NEXT: s_mov_b32 s55, s43 -; GCN-HSA-NEXT: s_mov_b32 s57, s43 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s42, s15 +; GCN-HSA-NEXT: s_mov_b32 s40, s15 ; GCN-HSA-NEXT: s_mov_b32 s48, s13 ; GCN-HSA-NEXT: s_mov_b32 s50, s11 ; GCN-HSA-NEXT: s_mov_b32 s52, s9 @@ -8524,16 +8496,14 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 -; GCN-HSA-NEXT: s_mov_b32 s45, s43 -; GCN-HSA-NEXT: s_mov_b32 s59, s43 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[40:41], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 @@ -8550,7 +8520,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 @@ -8574,14 +8544,14 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 ; GCN-HSA-NEXT: s_addc_u32 
s47, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s41 -; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s42 +; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s43 +; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 @@ -8597,12 +8567,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] @@ -8626,10 +8596,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 
v27, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 10d7541d8722d..3b0f8523e1b52 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6395,10 +6395,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: s_mov_b32 s9, s7 -; GFX6-NOHSA-NEXT: s_mov_b32 s11, s7 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 8 @@ -6441,13 +6438,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_mov_b32 s5, 0 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-HSA-NEXT: s_mov_b32 s7, s5 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX7-HSA-NEXT: s_mov_b32 s9, s5 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 8 @@ -6502,14 +6496,12 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0 -; GFX8-NOHSA-NEXT: s_mov_b32 s7, s5 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 
16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s6, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s8, s3 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 @@ -6517,8 +6509,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 @@ -6535,10 +6527,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6616,32 +6608,29 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s5 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_lshr_b32 s4, s3, 16 -; GFX12-NEXT: s_lshr_b32 s8, s3, 8 -; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_lshr_b32 s6, s3, 8 +; GFX12-NEXT: s_mov_b32 s8, s3 ; GFX12-NEXT: s_lshr_b32 s10, s2, 16 ; GFX12-NEXT: s_lshr_b32 s12, s2, 24 ; GFX12-NEXT: s_lshr_b32 s14, s2, 8 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s11 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7 +; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 ; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s15 ; GFX12-NEXT: v_mov_b32_e32 v6, s14 @@ -7038,23 +7027,17 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s13, 0 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: s_mov_b32 s15, s13 -; GFX6-NOHSA-NEXT: s_mov_b32 s5, s13 -; GFX6-NOHSA-NEXT: s_mov_b32 s17, s13 -; GFX6-NOHSA-NEXT: s_mov_b32 s19, s13 -; GFX6-NOHSA-NEXT: s_mov_b32 s21, s13 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NOHSA-NEXT: s_lshr_b32 s12, s11, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s11, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s4, s11 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s10, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s10, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s10, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s9, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s9, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s20, s9 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s10, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s10, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s10, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s9, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s9, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s9 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s8, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s8, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s8, 8 @@ -7062,16 +7045,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[8:9], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 @@ -7082,8 +7065,8 @@ 
define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 @@ -7093,21 +7076,21 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], 
off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 @@ -7124,42 +7107,35 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_mov_b32 s5, 0 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX7-HSA-NEXT: s_mov_b32 s13, s5 -; GFX7-HSA-NEXT: s_mov_b32 s15, s5 -; GFX7-HSA-NEXT: s_mov_b32 s23, s5 -; GFX7-HSA-NEXT: s_mov_b32 s25, s5 +; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s4, s11, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s11, 8 -; GFX7-HSA-NEXT: s_mov_b32 s14, s11 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s10, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s10, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s9, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s9, 8 -; GFX7-HSA-NEXT: s_mov_b32 s26, s9 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s8, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s8, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s8, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[6:7], s[8:9], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 -; GFX7-HSA-NEXT: s_mov_b32 s27, s5 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; 
GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s12, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s24, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[4:5], s[6:7], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -7167,65 +7143,65 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s30, s0, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s31, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s13 -; GFX7-HSA-NEXT: s_add_u32 
s12, s0, 0x50 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; 
GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 32 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -7240,16 +7216,12 @@ 
define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NOHSA-NEXT: s_mov_b32 s11, 0 -; GFX8-NOHSA-NEXT: s_mov_b32 s13, s11 -; GFX8-NOHSA-NEXT: s_mov_b32 s23, s11 -; GFX8-NOHSA-NEXT: s_mov_b32 s27, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s12, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s7 ; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 @@ -7272,8 +7244,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 @@ -7287,10 +7259,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7452,68 +7424,62 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s11, s3 -; GFX12-NEXT: s_mov_b32 s13, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s2, s7, 16 -; GFX12-NEXT: s_lshr_b32 s14, s7, 8 -; GFX12-NEXT: s_mov_b32 s8, s7 -; GFX12-NEXT: s_lshr_b32 s16, s6, 16 -; GFX12-NEXT: s_lshr_b32 s18, s6, 24 -; GFX12-NEXT: s_lshr_b32 s20, s6, 8 +; GFX12-NEXT: s_lshr_b32 s8, s7, 8 +; GFX12-NEXT: s_mov_b32 s10, s7 +; GFX12-NEXT: s_lshr_b32 s12, s6, 16 +; GFX12-NEXT: s_lshr_b32 s14, s6, 24 +; GFX12-NEXT: s_lshr_b32 s16, s6, 8 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_lshr_b32 s18, s5, 16 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v26, 0 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s35 ; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v9, s11 +; GFX12-NEXT: s_lshr_b32 s20, s5, 8 +; GFX12-NEXT: s_mov_b32 s22, s5 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v9, s15 -; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17 -; GFX12-NEXT: 
s_lshr_b32 s10, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: s_lshr_b32 s22, s5, 8 -; GFX12-NEXT: s_mov_b32 s12, s5 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s13 ; GFX12-NEXT: s_lshr_b32 s24, s4, 16 ; GFX12-NEXT: s_lshr_b32 s26, s4, 24 ; GFX12-NEXT: s_lshr_b32 s28, s4, 8 ; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v13, s19 -; GFX12-NEXT: v_mov_b32_e32 v12, s18 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 +; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v26, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v26, v[6:9], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v7, s21 :: v_dual_mov_b32 v6, s20 -; GFX12-NEXT: v_mov_b32_e32 v1, s11 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v17, s19 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s5 +; GFX12-NEXT: v_mov_b32_e32 v18, s4 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s23 -; GFX12-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v19, s25 -; GFX12-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v21, s27 -; GFX12-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v23, 
s31 -; GFX12-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v25, s29 -; GFX12-NEXT: v_mov_b32_e32 v24, s28 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v0, s22 +; GFX12-NEXT: v_dual_mov_b32 v3, s21 :: v_dual_mov_b32 v2, s20 +; GFX12-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v8, s24 +; GFX12-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v10, s26 +; GFX12-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v20, s30 +; GFX12-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v22, s28 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v26, v[10:13], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v26, v[4:7], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v26, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v26, v[14:17], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v26, v[18:21], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v26, v[22:25], s[0:1] +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = sext <16 x i8> %load to <16 x i64> @@ -8224,39 +8190,37 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX6-NOHSA-NEXT: s_mov_b32 s47, 0 -; GFX6-NOHSA-NEXT: s_mov_b32 s11, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s31, s47 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 -; 
GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s30, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[64:65], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s54, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s38, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s42, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s34, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 16 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s58, s1 +; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 ; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 16 ; 
GFX6-NOHSA-NEXT: s_lshr_b32 s52, s0, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 8 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[30:31], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[60:61], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[38:39], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[2:3], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 @@ -8264,162 +8228,143 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s71 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s71 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s65 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s63 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s15, 
s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s17, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s21, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s35, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s37, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s55, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s45, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s49, s47 -; GFX6-NOHSA-NEXT: s_mov_b32 s59, s47 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s6 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[20:21], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[64:65], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x80000 +; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[52:53], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[38:39], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s61 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, 
s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s65 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s63 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v3, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_mov_b32 s15, 0 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX7-HSA-NEXT: s_mov_b32 s47, s15 -; GFX7-HSA-NEXT: s_mov_b32 s49, s15 -; GFX7-HSA-NEXT: s_mov_b32 
s59, s15 -; GFX7-HSA-NEXT: s_mov_b32 s61, s15 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s46, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s48, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s52, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s5, 8 ; GFX7-HSA-NEXT: s_mov_b32 s62, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 ; GFX7-HSA-NEXT: s_mov_b32 s34, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s20, s1, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 @@ -8429,20 +8374,13 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[18:19], s[2:3], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[44:45], s[4:5], 56 ; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 -; GFX7-HSA-NEXT: s_mov_b32 s63, s15 -; GFX7-HSA-NEXT: s_mov_b32 s37, s15 -; GFX7-HSA-NEXT: s_mov_b32 s31, s15 -; GFX7-HSA-NEXT: s_mov_b32 s35, s15 -; GFX7-HSA-NEXT: s_mov_b32 s21, 
s15 -; GFX7-HSA-NEXT: s_mov_b32 s65, s15 -; GFX7-HSA-NEXT: s_mov_b32 s17, s15 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 @@ -8454,73 +8392,75 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xe0 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s47 -; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s40 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s41 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s41 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s50 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s51 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s51 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s44 +; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s41 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s46 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s47 +; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 
v5, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s25 -; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 +; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s29 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s18 ; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x70 @@ -8532,19 +8472,17 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; 
GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x60 ; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s36 @@ -8559,10 +8497,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8615,20 +8553,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 
s[8:11], s[4:5], 0x24 -; GFX8-NOHSA-NEXT: s_mov_b32 s47, 0 -; GFX8-NOHSA-NEXT: s_mov_b32 s49, s47 -; GFX8-NOHSA-NEXT: s_mov_b32 s59, s47 -; GFX8-NOHSA-NEXT: s_mov_b32 s63, s47 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX8-NOHSA-NEXT: s_mov_b32 s37, s47 -; GFX8-NOHSA-NEXT: s_mov_b32 s29, s47 -; GFX8-NOHSA-NEXT: s_mov_b32 s19, s47 -; GFX8-NOHSA-NEXT: s_mov_b32 s65, s47 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s48, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s50, s7 ; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s6, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s6, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 8 @@ -8679,8 +8609,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 ; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xf0 @@ -8694,10 +8624,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 
; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -9027,94 +8957,86 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s39, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s41, s39 -; GFX12-NEXT: s_mov_b32 s25, s39 -; GFX12-NEXT: s_mov_b32 s43, s39 -; GFX12-NEXT: s_mov_b32 s45, s39 -; GFX12-NEXT: s_mov_b32 s35, s39 -; GFX12-NEXT: s_mov_b32 s19, s39 -; GFX12-NEXT: s_mov_b32 s15, s39 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s38, s7, 16 -; GFX12-NEXT: s_lshr_b32 s46, s7, 8 +; GFX12-NEXT: s_lshr_b32 s36, s7, 16 +; GFX12-NEXT: s_lshr_b32 s38, s7, 8 ; GFX12-NEXT: s_mov_b32 s40, s7 -; GFX12-NEXT: s_lshr_b32 s48, s6, 16 -; GFX12-NEXT: s_lshr_b32 s50, s6, 24 +; GFX12-NEXT: s_lshr_b32 s42, s6, 16 +; GFX12-NEXT: s_lshr_b32 s44, s6, 24 ; GFX12-NEXT: s_ashr_i64 s[74:75], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: s_lshr_b32 s52, s6, 8 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: s_lshr_b32 s46, s6, 8 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s39 -; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s38 :: v_dual_mov_b32 v3, s75 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s75 ; GFX12-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v5, s41 -; 
GFX12-NEXT: s_lshr_b32 s24, s5, 16 +; GFX12-NEXT: s_lshr_b32 s48, s5, 16 ; GFX12-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s47 -; GFX12-NEXT: v_dual_mov_b32 v6, s46 :: v_dual_mov_b32 v9, s49 -; GFX12-NEXT: s_lshr_b32 s54, s5, 8 -; GFX12-NEXT: s_mov_b32 s42, s5 -; GFX12-NEXT: v_dual_mov_b32 v8, s48 :: v_dual_mov_b32 v11, s51 -; GFX12-NEXT: v_dual_mov_b32 v10, s50 :: v_dual_mov_b32 v13, s73 -; GFX12-NEXT: s_lshr_b32 s56, s4, 16 -; GFX12-NEXT: s_lshr_b32 s58, s4, 24 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s39 +; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s43 +; GFX12-NEXT: s_lshr_b32 s50, s5, 8 +; GFX12-NEXT: s_mov_b32 s52, s5 +; GFX12-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45 +; GFX12-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s73 +; GFX12-NEXT: s_lshr_b32 s54, s4, 16 +; GFX12-NEXT: s_lshr_b32 s56, s4, 24 ; GFX12-NEXT: s_ashr_i64 s[70:71], s[4:5], 56 -; GFX12-NEXT: v_dual_mov_b32 v12, s72 :: v_dual_mov_b32 v15, s53 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: v_mov_b32_e32 v14, s52 -; GFX12-NEXT: s_lshr_b32 s60, s4, 8 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GFX12-NEXT: s_lshr_b32 s44, s3, 16 -; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s72 :: v_dual_mov_b32 v15, s47 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v14, s46 +; GFX12-NEXT: s_lshr_b32 s58, s4, 8 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GFX12-NEXT: s_lshr_b32 s60, s3, 16 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] 
offset:240 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192 -; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v0, s36 ; GFX12-NEXT: v_dual_mov_b32 v3, s71 :: v_dual_mov_b32 v2, s70 -; GFX12-NEXT: v_mov_b32_e32 v5, s43 -; GFX12-NEXT: s_lshr_b32 s36, s3, 8 -; GFX12-NEXT: s_mov_b32 s34, s3 -; GFX12-NEXT: s_lshr_b32 s26, s2, 16 +; GFX12-NEXT: v_mov_b32_e32 v5, s53 +; GFX12-NEXT: s_lshr_b32 s34, s3, 8 +; GFX12-NEXT: s_mov_b32 s30, s3 +; GFX12-NEXT: s_lshr_b32 s24, s2, 16 ; GFX12-NEXT: s_lshr_b32 s22, s2, 24 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s42 :: v_dual_mov_b32 v7, s55 -; GFX12-NEXT: v_dual_mov_b32 v6, s54 :: v_dual_mov_b32 v9, s57 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s51 +; GFX12-NEXT: v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s55 ; GFX12-NEXT: s_lshr_b32 s20, s2, 8 -; GFX12-NEXT: s_ashr_i64 s[28:29], s[2:3], 56 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59 -; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s31 +; GFX12-NEXT: s_ashr_i64 s[26:27], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s57 +; GFX12-NEXT: v_dual_mov_b32 v10, s56 :: v_dual_mov_b32 v13, s29 ; GFX12-NEXT: s_lshr_b32 s18, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[34:35], 
s[34:35], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s30 :: v_dual_mov_b32 v15, s61 -; GFX12-NEXT: v_dual_mov_b32 v14, s60 :: v_dual_mov_b32 v17, s45 -; GFX12-NEXT: s_lshr_b32 s62, s1, 8 -; GFX12-NEXT: s_mov_b32 s14, s1 +; GFX12-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v15, s59 +; GFX12-NEXT: v_dual_mov_b32 v14, s58 :: v_dual_mov_b32 v17, s61 +; GFX12-NEXT: s_lshr_b32 s14, s1, 8 +; GFX12-NEXT: s_mov_b32 s62, s1 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, s44 :: v_dual_mov_b32 v19, s29 -; GFX12-NEXT: v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v21, s35 +; GFX12-NEXT: v_dual_mov_b32 v16, s60 :: v_dual_mov_b32 v19, s27 +; GFX12-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v21, s31 ; GFX12-NEXT: s_lshr_b32 s64, s0, 16 ; GFX12-NEXT: s_lshr_b32 s66, s0, 24 ; GFX12-NEXT: s_ashr_i64 s[12:13], s[0:1], 56 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v23, s37 -; GFX12-NEXT: v_mov_b32_e32 v22, s36 +; GFX12-NEXT: v_dual_mov_b32 v20, s30 :: v_dual_mov_b32 v23, s35 +; GFX12-NEXT: v_mov_b32_e32 v22, s34 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 @@ -9122,12 +9044,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s26 +; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24 ; GFX12-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 ; GFX12-NEXT: v_mov_b32_e32 v5, s17 ; GFX12-NEXT: s_lshr_b32 s68, s0, 8 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[14:15], 0x80000 -; 
GFX12-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s21 ; GFX12-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v9, s19 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll index bea1ca1bc05b5..5fc1a87e71a1a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll @@ -112,9 +112,8 @@ define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 23, v0 -; SDAG-NEXT: v_ashrrev_i64 v[0:1], 55, v[1:2] +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 23, v0 +; SDAG-NEXT: v_ashrrev_i64 v[0:1], 55, v[0:1] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: range_metadata_sext_i8_signed_range_i64: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 5b94398908a56..c5c95380fde9b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -463,13 +463,12 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { ; SI-LABEL: mad_i64_i32_sextops_i31_i63: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 1, v0 -; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: v_ashr_i64 v[6:7], v[4:5], 33 -; SI-NEXT: v_lshlrev_b32_e32 v5, 1, v1 -; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 33 -; SI-NEXT: v_mul_lo_u32 v1, v6, v0 -; SI-NEXT: v_mul_hi_i32 v4, v6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; SI-NEXT: v_ashr_i64 v[4:5], v[3:4], 33 +; 
SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 33 +; SI-NEXT: v_mul_lo_u32 v1, v4, v0 +; SI-NEXT: v_mul_hi_i32 v4, v4, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 9b733a1c6012f..11cf129b1e479 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -464,20 +464,19 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_load_dword s2, s[4:5], 0xb -; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dword s4, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s2, 8 -; SI-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_lshl_b32 s7, s4, 8 +; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_mul_i32 s4, s6, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s6, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_mul_i32 s5, s4, s6 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -485,18 +484,17 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s3, s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 
s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s3, 8 -; VI-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 -; VI-NEXT: s_lshl_b32 s3, s6, 8 +; VI-NEXT: s_lshl_b32 s3, s2, 8 +; VI-NEXT: s_lshl_b32 s5, s6, 8 +; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 -; VI-NEXT: v_mul_i32_i24_e32 v0, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 +; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] @@ -508,20 +506,19 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s8, 8 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s5, s6, s4 -; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -613,17 +610,16 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 
-; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s8, 8 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s4, s6, s4 +; GFX9-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 885c0829a88c3..a166c4f93462d 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -461,12 +461,12 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -491,12 +491,12 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 
; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -676,7 +676,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -684,19 +684,19 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_sub_i32 s4, 0, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-NEXT: s_sub_i32 s2, 0, s9 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s1, s2, s8 -; GCN-NEXT: s_ashr_i32 s1, s1, 31 -; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 ; GCN-NEXT: s_abs_i32 s0, s2 +; GCN-NEXT: s_xor_b32 s1, s2, s8 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s2, v0 @@ -720,7 +720,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -728,19 +728,19 @@ define amdgpu_kernel void 
@s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 ; GCN-IR-NEXT: s_abs_i32 s0, s2 +; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 @@ -772,12 +772,12 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -802,12 +802,12 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-IR-NEXT: 
s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -838,7 +838,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -846,19 +846,19 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_sub_i32 s4, 0, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-NEXT: s_sub_i32 s2, 0, s9 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s1, s2, s8 -; GCN-NEXT: s_ashr_i32 s1, s1, 31 -; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 ; GCN-NEXT: s_abs_i32 s0, s2 +; GCN-NEXT: s_xor_b32 s1, s2, s8 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s2, v0 @@ -882,7 +882,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0xd +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -890,19 +890,19 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 ; GCN-IR-NEXT: s_abs_i32 s0, s2 +; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 9b937e6524559..65a17ed67481c 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -149,12 +149,11 @@ define i128 @v_lshr_i128_kv(i128 %rhs) { ; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, 0x41 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41 +; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -168,11 +167,10 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index f7e0b7ea79437..c9e5ff444f715 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -443,12 +443,12 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -478,12 +478,12 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; 
GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 @@ -520,12 +520,12 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -555,12 +555,12 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 @@ -650,7 +650,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -658,16 +658,16 @@ define amdgpu_kernel void 
@s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_sub_i32 s4, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_abs_i32 s3, s2 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 ; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-NEXT: s_abs_i32 s3, s2 ; GCN-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -691,7 +691,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -699,16 +699,16 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_abs_i32 s3, s2 -; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, 
v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-IR-NEXT: s_abs_i32 s3, s2 ; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -739,7 +739,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -747,16 +747,16 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_sub_i32 s4, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_abs_i32 s3, s2 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 ; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-NEXT: s_abs_i32 s3, s2 ; GCN-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -780,7 +780,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt 
lgkmcnt(0) @@ -788,16 +788,16 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_abs_i32 s3, s2 -; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-IR-NEXT: s_abs_i32 s3, s2 ; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll index 8c3f2880f22a2..4d091c2302658 100644 --- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll @@ -26,7 +26,7 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: vmov.32 d9[1], r5 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: mvn r12, #0 +; CHECK-NEXT: mvn r4, #0 ; CHECK-NEXT: movwlt r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: mvnne r5, #0 @@ -34,6 +34,7 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: vmov.32 d8[1], r1 ; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmov.i32 q10, #0x80000000 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vdup.32 d19, r5 @@ -42,24 +43,22 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: vdup.32 d18, r0 ; CHECK-NEXT: vbit q8, q4, q9 -; 
CHECK-NEXT: adr r4, .LCPI0_1 -; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128] ; CHECK-NEXT: vmov r0, r1, d17 ; CHECK-NEXT: vmov r3, r5, d16 ; CHECK-NEXT: rsbs r0, r0, #-2147483648 -; CHECK-NEXT: sbcs r0, r12, r1 +; CHECK-NEXT: sbcs r0, r4, r1 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: rsbs r1, r3, #-2147483648 -; CHECK-NEXT: sbcs r1, r12, r5 -; CHECK-NEXT: vdup.32 d21, r0 +; CHECK-NEXT: sbcs r1, r4, r5 +; CHECK-NEXT: vdup.32 d19, r0 ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d20, r2 -; CHECK-NEXT: vbif q8, q9, q10 +; CHECK-NEXT: vdup.32 d18, r2 +; CHECK-NEXT: vbif q8, q10, q9 ; CHECK-NEXT: vmovn.i64 d0, q8 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r11, pc} @@ -70,11 +69,6 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff ; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .LCPI0_1: -; CHECK-NEXT: .long 2147483648 @ 0x80000000 -; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .long 2147483648 @ 0x80000000 -; CHECK-NEXT: .long 4294967295 @ 0xffffffff entry: %conv = fptosi <2 x double> %x to <2 x i64> %0 = icmp slt <2 x i64> %conv, @@ -100,22 +94,20 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) { ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: vmov.32 d9[0], r4 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov.32 d8[0], r0 ; CHECK-NEXT: mvn r3, #0 ; CHECK-NEXT: subs r4, r4, r3 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: vmov.32 d9[1], r5 ; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: vmov.32 d8[0], r0 ; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movwlo r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: mvnne r5, #0 ; CHECK-NEXT: subs r0, r0, r3 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov.32 d8[1], r1 +; CHECK-NEXT: vdup.32 d17, r5 ; CHECK-NEXT: movwlo r2, #1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vdup.32 d17, r5 ; CHECK-NEXT: mvnne 
r2, #0 ; CHECK-NEXT: vdup.32 d16, r2 ; CHECK-NEXT: vand q9, q4, q8 @@ -177,11 +169,11 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: rsbs r1, r3, #0 ; CHECK-NEXT: rscs r1, r5, #0 -; CHECK-NEXT: vdup.32 d19, r0 +; CHECK-NEXT: vmov.32 d19[0], r0 ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d18, r2 +; CHECK-NEXT: vmov.32 d18[0], r2 ; CHECK-NEXT: vand q8, q8, q9 ; CHECK-NEXT: vmovn.i64 d0, q8 ; CHECK-NEXT: vpop {d8, d9} @@ -332,61 +324,57 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vorr q4, q0, q0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: vmov.32 d9[0], r8 +; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: vmov r5, s19 +; CHECK-NEXT: vmov r7, s18 +; CHECK-NEXT: vmov.32 d9[0], r10 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: vmov.32 d10[0], r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: vmov.32 d8[0], r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov.32 d11[0], r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vmov.32 d8[0], r0 ; CHECK-NEXT: mvn r3, #0 +; CHECK-NEXT: vmov.32 d10[0], r0 ; CHECK-NEXT: subs r0, r0, r3 ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov.32 d9[1], r9 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movwlo r0, #1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov.32 d8[1], r1 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: subs r1, r8, r3 -; 
CHECK-NEXT: sbcs r1, r9, #0 -; CHECK-NEXT: vmov.32 d11[1], r5 +; CHECK-NEXT: subs r1, r5, r3 +; CHECK-NEXT: sbcs r1, r4, #0 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movwlo r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: subs r6, r4, r3 -; CHECK-NEXT: sbcs r6, r5, #0 +; CHECK-NEXT: subs r7, r10, r3 +; CHECK-NEXT: sbcs r7, r8, #0 ; CHECK-NEXT: vdup.32 d19, r1 -; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: vdup.32 d18, r0 -; CHECK-NEXT: movwlo r6, #1 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: subs r3, r10, r3 -; CHECK-NEXT: sbcs r3, r7, #0 -; CHECK-NEXT: vmov.32 d10[1], r7 +; CHECK-NEXT: movwlo r7, #1 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: mvnne r7, #0 +; CHECK-NEXT: subs r3, r6, r3 +; CHECK-NEXT: sbcs r3, r9, #0 +; CHECK-NEXT: vdup.32 d17, r7 ; CHECK-NEXT: movwlo r2, #1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vdup.32 d17, r6 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vand q10, q4, q9 +; CHECK-NEXT: vand q10, q5, q9 ; CHECK-NEXT: vdup.32 d16, r2 -; CHECK-NEXT: vand q11, q5, q8 +; CHECK-NEXT: vand q11, q4, q8 ; CHECK-NEXT: vorn q9, q10, q9 ; CHECK-NEXT: vorn q8, q11, q8 ; CHECK-NEXT: vmovn.i64 d1, q9 @@ -409,46 +397,45 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vorr q4, q0, q0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.32 d17[0], r2 -; CHECK-NEXT: mvn r4, #0 -; CHECK-NEXT: subs r2, r2, r4 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: vmov r5, s17 ; CHECK-NEXT: vmov r8, s16 -; CHECK-NEXT: vmov.32 d16[0], r5 -; CHECK-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEXT: mov r7, #0 -; CHECK-NEXT: vmov.32 d17[1], r1 -; CHECK-NEXT: sbcs r1, r1, #0 
-; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: movwlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: subs r2, r5, r4 -; CHECK-NEXT: sbcs r2, r6, #0 -; CHECK-NEXT: vdup.32 d19, r1 +; CHECK-NEXT: vmov.32 d9[0], r6 +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: mvn r9, #0 +; CHECK-NEXT: subs r2, r6, r9 +; CHECK-NEXT: sbcs r2, r7, #0 +; CHECK-NEXT: vmov.32 d8[0], r0 ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: vmov.32 d16[1], r6 +; CHECK-NEXT: vmov.i64 q5, #0xffffffff ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d18, r2 -; CHECK-NEXT: vorr q4, q9, q9 -; CHECK-NEXT: vbsl q4, q8, q5 -; CHECK-NEXT: vmov r10, r9, d8 +; CHECK-NEXT: subs r0, r0, r9 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov.32 d9[1], r7 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movwlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov.32 d8[1], r1 +; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: vdup.32 d17, r2 +; CHECK-NEXT: vdup.32 d16, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: vbif q4, q5, q8 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov.32 d13[0], r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: vmov r7, r10, d8 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: subs r2, r5, r4 +; CHECK-NEXT: subs r2, r5, r9 ; CHECK-NEXT: vmov.32 d12[0], r0 ; CHECK-NEXT: sbcs r2, r6, #0 ; CHECK-NEXT: mov r2, #0 @@ -456,25 +443,25 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: movwlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: subs r0, r0, r4 +; CHECK-NEXT: subs r0, r0, r9 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov.32 d12[1], r1 +; CHECK-NEXT: vdup.32 d17, r2 ; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmov.32 d12[1], r1 ; CHECK-NEXT: movwlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d17, r2 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: vdup.32 d16, r0 -; CHECK-NEXT: vmov r0, r1, d9 +; 
CHECK-NEXT: rsbs r7, r7, #0 ; CHECK-NEXT: vbsl q8, q6, q5 -; CHECK-NEXT: rsbs r6, r10, #0 -; CHECK-NEXT: rscs r6, r9, #0 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: movwlt r6, #1 -; CHECK-NEXT: vmov r5, r4, d16 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: mvnne r6, #0 +; CHECK-NEXT: rscs r7, r10, #0 +; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: movwlt r7, #1 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: vmov r0, r1, d17 +; CHECK-NEXT: mvnne r7, #0 +; CHECK-NEXT: vmov r6, r5, d16 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: rscs r0, r1, #0 ; CHECK-NEXT: mov r0, #0 @@ -483,20 +470,20 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: rscs r1, r3, #0 -; CHECK-NEXT: vdup.32 d21, r0 +; CHECK-NEXT: vmov.32 d19[0], r0 ; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: vdup.32 d20, r6 ; CHECK-NEXT: movwlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: rsbs r2, r5, #0 -; CHECK-NEXT: rscs r2, r4, #0 -; CHECK-NEXT: vdup.32 d19, r1 -; CHECK-NEXT: movwlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: mvnne r7, #0 +; CHECK-NEXT: rsbs r0, r6, #0 +; CHECK-NEXT: rscs r0, r5, #0 +; CHECK-NEXT: vmov.32 d21[0], r1 +; CHECK-NEXT: movwlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: vmov.32 d20[0], r7 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vmov.32 d18[0], r4 ; CHECK-NEXT: vand q10, q4, q10 -; CHECK-NEXT: vdup.32 d18, r7 ; CHECK-NEXT: vand q8, q8, q9 ; CHECK-NEXT: vmovn.i64 d1, q10 ; CHECK-NEXT: vmovn.i64 d0, q8 @@ -764,151 +751,141 @@ entry: define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NEON-LABEL: utesth_f16i32: ; CHECK-NEON: @ %bb.0: @ %entry -; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} 
+; CHECK-NEON-NEXT: .vsave {d12, d13} +; CHECK-NEON-NEXT: vpush {d12, d13} +; CHECK-NEON-NEXT: .vsave {d8, d9, d10} +; CHECK-NEON-NEXT: vpush {d8, d9, d10} ; CHECK-NEON-NEXT: vmov r0, s3 ; CHECK-NEON-NEXT: vmov.f32 s16, s2 ; CHECK-NEON-NEXT: vmov.f32 s18, s1 ; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mov r8, r0 +; CHECK-NEON-NEXT: mov r4, r0 +; CHECK-NEON-NEXT: vmov r0, s18 +; CHECK-NEON-NEXT: mov r8, r1 +; CHECK-NEON-NEXT: bl __aeabi_h2f +; CHECK-NEON-NEXT: bl __aeabi_f2ulz +; CHECK-NEON-NEXT: mov r6, r0 +; CHECK-NEON-NEXT: vmov.32 d13[0], r0 ; CHECK-NEON-NEXT: vmov r0, s20 ; CHECK-NEON-NEXT: mov r9, r1 -; CHECK-NEON-NEXT: vmov r4, s18 -; CHECK-NEON-NEXT: vmov r6, s16 -; CHECK-NEON-NEXT: vmov.32 d9[0], r8 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mov r10, r0 -; CHECK-NEON-NEXT: vmov.32 d10[0], r0 -; CHECK-NEON-NEXT: mov r0, r4 +; CHECK-NEON-NEXT: mov r5, r0 +; CHECK-NEON-NEXT: vmov.32 d12[0], r0 +; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: mov r7, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f +; CHECK-NEON-NEXT: vmov.32 d9[0], r4 ; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: mov r4, r0 -; CHECK-NEON-NEXT: vmov.32 d11[0], r0 -; CHECK-NEON-NEXT: mov r0, r6 -; CHECK-NEON-NEXT: mov r5, r1 -; CHECK-NEON-NEXT: bl __aeabi_h2f -; CHECK-NEON-NEXT: bl __aeabi_f2ulz -; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: mvn r3, #0 +; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: subs r0, r0, r3 ; CHECK-NEON-NEXT: mov r2, #0 ; CHECK-NEON-NEXT: sbcs r0, r1, #0 -; CHECK-NEON-NEXT: vmov.32 d9[1], r9 ; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: movwlo r0, #1 ; CHECK-NEON-NEXT: cmp r0, #0 -; CHECK-NEON-NEXT: vmov.32 d8[1], r1 ; CHECK-NEON-NEXT: mvnne r0, #0 -; CHECK-NEON-NEXT: subs r1, r8, r3 -; CHECK-NEON-NEXT: sbcs r1, r9, #0 -; CHECK-NEON-NEXT: vmov.32 d11[1], r5 +; CHECK-NEON-NEXT: subs r1, r4, r3 +; CHECK-NEON-NEXT: sbcs r1, 
r8, #0 ; CHECK-NEON-NEXT: mov r1, #0 ; CHECK-NEON-NEXT: movwlo r1, #1 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: subs r6, r4, r3 -; CHECK-NEON-NEXT: sbcs r6, r5, #0 +; CHECK-NEON-NEXT: subs r6, r6, r3 +; CHECK-NEON-NEXT: sbcs r6, r9, #0 ; CHECK-NEON-NEXT: vdup.32 d19, r1 ; CHECK-NEON-NEXT: mov r6, #0 ; CHECK-NEON-NEXT: vdup.32 d18, r0 ; CHECK-NEON-NEXT: movwlo r6, #1 ; CHECK-NEON-NEXT: cmp r6, #0 ; CHECK-NEON-NEXT: mvnne r6, #0 -; CHECK-NEON-NEXT: subs r3, r10, r3 +; CHECK-NEON-NEXT: subs r3, r5, r3 ; CHECK-NEON-NEXT: sbcs r3, r7, #0 -; CHECK-NEON-NEXT: vmov.32 d10[1], r7 +; CHECK-NEON-NEXT: vdup.32 d17, r6 ; CHECK-NEON-NEXT: movwlo r2, #1 ; CHECK-NEON-NEXT: cmp r2, #0 -; CHECK-NEON-NEXT: vdup.32 d17, r6 ; CHECK-NEON-NEXT: mvnne r2, #0 ; CHECK-NEON-NEXT: vand q10, q4, q9 ; CHECK-NEON-NEXT: vdup.32 d16, r2 -; CHECK-NEON-NEXT: vand q11, q5, q8 +; CHECK-NEON-NEXT: vand q11, q6, q8 ; CHECK-NEON-NEXT: vorn q9, q10, q9 ; CHECK-NEON-NEXT: vorn q8, q11, q8 ; CHECK-NEON-NEXT: vmovn.i64 d1, q9 ; CHECK-NEON-NEXT: vmovn.i64 d0, q8 -; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEON-NEXT: vpop {d8, d9, d10} +; CHECK-NEON-NEXT: vpop {d12, d13} +; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} ; ; CHECK-FP16-LABEL: utesth_f16i32: ; CHECK-FP16: @ %bb.0: @ %entry ; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-FP16-NEXT: .vsave {d10, d11, d12, d13} -; CHECK-FP16-NEXT: vpush {d10, d11, d12, d13} -; CHECK-FP16-NEXT: .vsave {d8} -; CHECK-FP16-NEXT: vpush {d8} -; CHECK-FP16-NEXT: vmov.u16 r0, d0[3] +; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11} +; CHECK-FP16-NEXT: vmov.u16 r0, d0[1] ; CHECK-FP16-NEXT: vorr d8, d0, d0 -; CHECK-FP16-NEXT: vmov.u16 r4, d0[1] +; CHECK-FP16-NEXT: vmov.u16 r5, d0[3] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl 
__fixunshfdi -; CHECK-FP16-NEXT: mov r6, r0 +; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[0] -; CHECK-FP16-NEXT: mov r7, r1 -; CHECK-FP16-NEXT: vmov.32 d11[0], r6 +; CHECK-FP16-NEXT: mov r8, r1 +; CHECK-FP16-NEXT: vmov.32 d11[0], r4 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: vmov s0, r4 -; CHECK-FP16-NEXT: mov r8, r0 +; CHECK-FP16-NEXT: vmov s0, r5 +; CHECK-FP16-NEXT: mov r6, r0 ; CHECK-FP16-NEXT: mov r9, r1 -; CHECK-FP16-NEXT: vmov.32 d12[0], r0 +; CHECK-FP16-NEXT: vmov.32 d10[0], r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: mov r4, r0 +; CHECK-FP16-NEXT: mov r5, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] -; CHECK-FP16-NEXT: mov r5, r1 -; CHECK-FP16-NEXT: vmov.32 d13[0], r4 +; CHECK-FP16-NEXT: mov r7, r1 +; CHECK-FP16-NEXT: vmov.32 d9[0], r5 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixunshfdi -; CHECK-FP16-NEXT: vmov.32 d10[0], r0 ; CHECK-FP16-NEXT: mvn r3, #0 +; CHECK-FP16-NEXT: vmov.32 d8[0], r0 ; CHECK-FP16-NEXT: subs r0, r0, r3 ; CHECK-FP16-NEXT: mov r2, #0 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 -; CHECK-FP16-NEXT: vmov.32 d11[1], r7 ; CHECK-FP16-NEXT: mov r0, #0 ; CHECK-FP16-NEXT: movwlo r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 -; CHECK-FP16-NEXT: vmov.32 d10[1], r1 ; CHECK-FP16-NEXT: mvnne r0, #0 -; CHECK-FP16-NEXT: subs r1, r6, r3 +; CHECK-FP16-NEXT: subs r1, r5, r3 ; CHECK-FP16-NEXT: sbcs r1, r7, #0 -; CHECK-FP16-NEXT: vmov.32 d13[1], r5 ; CHECK-FP16-NEXT: mov r1, #0 ; CHECK-FP16-NEXT: movwlo r1, #1 ; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: mvnne r1, #0 ; CHECK-FP16-NEXT: subs r7, r4, r3 -; CHECK-FP16-NEXT: sbcs r7, r5, #0 +; CHECK-FP16-NEXT: sbcs r7, r8, #0 ; CHECK-FP16-NEXT: vdup.32 d19, r1 ; CHECK-FP16-NEXT: mov r7, #0 ; CHECK-FP16-NEXT: vdup.32 d18, r0 ; CHECK-FP16-NEXT: movwlo r7, #1 ; CHECK-FP16-NEXT: cmp r7, #0 ; CHECK-FP16-NEXT: mvnne r7, #0 -; CHECK-FP16-NEXT: subs r3, r8, r3 +; CHECK-FP16-NEXT: subs r3, r6, r3 ; CHECK-FP16-NEXT: sbcs r3, r9, #0 -; 
CHECK-FP16-NEXT: vmov.32 d12[1], r9 +; CHECK-FP16-NEXT: vdup.32 d17, r7 ; CHECK-FP16-NEXT: movwlo r2, #1 ; CHECK-FP16-NEXT: cmp r2, #0 -; CHECK-FP16-NEXT: vdup.32 d17, r7 ; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: vand q10, q5, q9 +; CHECK-FP16-NEXT: vand q10, q4, q9 ; CHECK-FP16-NEXT: vdup.32 d16, r2 -; CHECK-FP16-NEXT: vand q11, q6, q8 +; CHECK-FP16-NEXT: vand q11, q5, q8 ; CHECK-FP16-NEXT: vorn q9, q10, q9 ; CHECK-FP16-NEXT: vorn q8, q11, q8 ; CHECK-FP16-NEXT: vmovn.i64 d1, q9 ; CHECK-FP16-NEXT: vmovn.i64 d0, q8 -; CHECK-FP16-NEXT: vpop {d8} -; CHECK-FP16-NEXT: vpop {d10, d11, d12, d13} +; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11} ; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} entry: %conv = fptoui <4 x half> %x to <4 x i64> @@ -925,8 +902,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEON-NEXT: vmov r0, s2 -; CHECK-NEON-NEXT: vmov.f32 s16, s3 +; CHECK-NEON-NEXT: vmov r0, s3 +; CHECK-NEON-NEXT: vmov.f32 s16, s2 ; CHECK-NEON-NEXT: vmov.f32 s18, s1 ; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f @@ -935,43 +912,42 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: mov r6, r1 ; CHECK-NEON-NEXT: bl __aeabi_h2f +; CHECK-NEON-NEXT: vmov r8, s20 +; CHECK-NEON-NEXT: vmov.32 d13[0], r5 ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: mov r2, r0 -; CHECK-NEON-NEXT: vmov r0, s18 -; CHECK-NEON-NEXT: vmov.32 d17[0], r2 -; CHECK-NEON-NEXT: mvn r8, #0 -; CHECK-NEON-NEXT: subs r2, r2, r8 -; CHECK-NEON-NEXT: vmov r4, s20 -; CHECK-NEON-NEXT: vmov.32 d16[0], r5 -; CHECK-NEON-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-NEON-NEXT: vmov r2, s18 +; CHECK-NEON-NEXT: vmov.32 d12[0], r0 +; CHECK-NEON-NEXT: mvn r9, #0 +; CHECK-NEON-NEXT: subs r0, r0, r9 +; CHECK-NEON-NEXT: sbcs r0, r1, #0 +; CHECK-NEON-NEXT: vmov.32 
d13[1], r6 +; CHECK-NEON-NEXT: mov r0, #0 ; CHECK-NEON-NEXT: mov r7, #0 -; CHECK-NEON-NEXT: vmov.32 d17[1], r1 -; CHECK-NEON-NEXT: sbcs r1, r1, #0 +; CHECK-NEON-NEXT: movwlt r0, #1 +; CHECK-NEON-NEXT: cmp r0, #0 +; CHECK-NEON-NEXT: vmov.32 d12[1], r1 +; CHECK-NEON-NEXT: mvnne r0, #0 +; CHECK-NEON-NEXT: subs r1, r5, r9 +; CHECK-NEON-NEXT: sbcs r1, r6, #0 ; CHECK-NEON-NEXT: mov r1, #0 ; CHECK-NEON-NEXT: movwlt r1, #1 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: subs r2, r5, r8 -; CHECK-NEON-NEXT: sbcs r2, r6, #0 -; CHECK-NEON-NEXT: vdup.32 d19, r1 -; CHECK-NEON-NEXT: mov r2, #0 -; CHECK-NEON-NEXT: vmov.32 d16[1], r6 -; CHECK-NEON-NEXT: movwlt r2, #1 -; CHECK-NEON-NEXT: cmp r2, #0 -; CHECK-NEON-NEXT: mvnne r2, #0 -; CHECK-NEON-NEXT: vdup.32 d18, r2 -; CHECK-NEON-NEXT: vorr q4, q9, q9 -; CHECK-NEON-NEXT: vbsl q4, q8, q5 -; CHECK-NEON-NEXT: vmov r10, r9, d8 +; CHECK-NEON-NEXT: vdup.32 d9, r1 +; CHECK-NEON-NEXT: vdup.32 d8, r0 +; CHECK-NEON-NEXT: mov r0, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f +; CHECK-NEON-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-NEON-NEXT: vbsl q4, q6, q5 ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r5, r0 ; CHECK-NEON-NEXT: vmov.32 d13[0], r0 -; CHECK-NEON-NEXT: mov r0, r4 +; CHECK-NEON-NEXT: mov r0, r8 ; CHECK-NEON-NEXT: mov r6, r1 +; CHECK-NEON-NEXT: vmov r4, r10, d8 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: subs r2, r5, r8 +; CHECK-NEON-NEXT: subs r2, r5, r9 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 ; CHECK-NEON-NEXT: sbcs r2, r6, #0 ; CHECK-NEON-NEXT: mov r2, #0 @@ -979,25 +955,25 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: movwlt r2, #1 ; CHECK-NEON-NEXT: cmp r2, #0 ; CHECK-NEON-NEXT: mvnne r2, #0 -; CHECK-NEON-NEXT: subs r0, r0, r8 +; CHECK-NEON-NEXT: subs r0, r0, r9 ; CHECK-NEON-NEXT: sbcs r0, r1, #0 -; CHECK-NEON-NEXT: vmov.32 d12[1], r1 +; CHECK-NEON-NEXT: vdup.32 d17, r2 ; CHECK-NEON-NEXT: mov r0, #0 +; CHECK-NEON-NEXT: vmov.32 
d12[1], r1 ; CHECK-NEON-NEXT: movwlt r0, #1 ; CHECK-NEON-NEXT: cmp r0, #0 ; CHECK-NEON-NEXT: mvnne r0, #0 -; CHECK-NEON-NEXT: vdup.32 d17, r2 +; CHECK-NEON-NEXT: vmov r2, r3, d9 ; CHECK-NEON-NEXT: vdup.32 d16, r0 -; CHECK-NEON-NEXT: vmov r0, r1, d9 +; CHECK-NEON-NEXT: rsbs r6, r4, #0 ; CHECK-NEON-NEXT: vbsl q8, q6, q5 -; CHECK-NEON-NEXT: rsbs r6, r10, #0 -; CHECK-NEON-NEXT: rscs r6, r9, #0 +; CHECK-NEON-NEXT: rscs r6, r10, #0 ; CHECK-NEON-NEXT: mov r6, #0 -; CHECK-NEON-NEXT: vmov r2, r3, d17 ; CHECK-NEON-NEXT: movwlt r6, #1 -; CHECK-NEON-NEXT: vmov r5, r4, d16 ; CHECK-NEON-NEXT: cmp r6, #0 +; CHECK-NEON-NEXT: vmov r0, r1, d17 ; CHECK-NEON-NEXT: mvnne r6, #0 +; CHECK-NEON-NEXT: vmov r5, r4, d16 ; CHECK-NEON-NEXT: rsbs r0, r0, #0 ; CHECK-NEON-NEXT: rscs r0, r1, #0 ; CHECK-NEON-NEXT: mov r0, #0 @@ -1006,20 +982,20 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEON-NEXT: mvnne r0, #0 ; CHECK-NEON-NEXT: rsbs r1, r2, #0 ; CHECK-NEON-NEXT: rscs r1, r3, #0 -; CHECK-NEON-NEXT: vdup.32 d21, r0 +; CHECK-NEON-NEXT: vmov.32 d19[0], r0 ; CHECK-NEON-NEXT: mov r1, #0 -; CHECK-NEON-NEXT: vdup.32 d20, r6 ; CHECK-NEON-NEXT: movwlt r1, #1 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvnne r1, #0 -; CHECK-NEON-NEXT: rsbs r2, r5, #0 -; CHECK-NEON-NEXT: rscs r2, r4, #0 -; CHECK-NEON-NEXT: vdup.32 d19, r1 +; CHECK-NEON-NEXT: rsbs r0, r5, #0 +; CHECK-NEON-NEXT: rscs r0, r4, #0 +; CHECK-NEON-NEXT: vmov.32 d21[0], r1 ; CHECK-NEON-NEXT: movwlt r7, #1 ; CHECK-NEON-NEXT: cmp r7, #0 +; CHECK-NEON-NEXT: vmov.32 d20[0], r6 ; CHECK-NEON-NEXT: mvnne r7, #0 +; CHECK-NEON-NEXT: vmov.32 d18[0], r7 ; CHECK-NEON-NEXT: vand q10, q4, q10 -; CHECK-NEON-NEXT: vdup.32 d18, r7 ; CHECK-NEON-NEXT: vand q8, q8, q9 ; CHECK-NEON-NEXT: vmovn.i64 d1, q10 ; CHECK-NEON-NEXT: vmovn.i64 d0, q8 @@ -1028,78 +1004,78 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; ; CHECK-FP16-LABEL: ustest_f16i32: ; CHECK-FP16: @ %bb.0: @ %entry -; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; 
CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-FP16-NEXT: vmov.u16 r0, d0[3] ; CHECK-FP16-NEXT: vorr d8, d0, d0 -; CHECK-FP16-NEXT: vmov.u16 r8, d0[1] +; CHECK-FP16-NEXT: vmov.u16 r8, d0[0] +; CHECK-FP16-NEXT: vmov.u16 r9, d0[1] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] ; CHECK-FP16-NEXT: mov r5, r1 -; CHECK-FP16-NEXT: vmov.32 d11[0], r4 +; CHECK-FP16-NEXT: vmov.32 d9[0], r4 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: mvn r7, #0 -; CHECK-FP16-NEXT: subs r2, r4, r7 +; CHECK-FP16-NEXT: mvn r10, #0 +; CHECK-FP16-NEXT: subs r2, r4, r10 ; CHECK-FP16-NEXT: sbcs r2, r5, #0 -; CHECK-FP16-NEXT: vmov.32 d10[0], r0 +; CHECK-FP16-NEXT: vmov.32 d8[0], r0 ; CHECK-FP16-NEXT: mov r2, #0 -; CHECK-FP16-NEXT: vmov.i64 q6, #0xffffffff +; CHECK-FP16-NEXT: vmov s0, r9 ; CHECK-FP16-NEXT: movwlt r2, #1 ; CHECK-FP16-NEXT: cmp r2, #0 ; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: subs r0, r0, r7 +; CHECK-FP16-NEXT: subs r0, r0, r10 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 -; CHECK-FP16-NEXT: vmov.32 d11[1], r5 +; CHECK-FP16-NEXT: vmov.32 d9[1], r5 ; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: vmov s0, r8 +; CHECK-FP16-NEXT: vmov.i64 q5, #0xffffffff ; CHECK-FP16-NEXT: movwlt r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 -; CHECK-FP16-NEXT: vmov.32 d10[1], r1 +; CHECK-FP16-NEXT: vmov.32 d8[1], r1 ; CHECK-FP16-NEXT: mvnne r0, #0 ; CHECK-FP16-NEXT: mov r6, #0 ; CHECK-FP16-NEXT: vdup.32 d17, r2 ; CHECK-FP16-NEXT: vdup.32 d16, r0 -; CHECK-FP16-NEXT: vbif q5, q6, q8 -; CHECK-FP16-NEXT: vmov r9, r8, d10 +; CHECK-FP16-NEXT: vbif q4, q5, q8 ; CHECK-FP16-NEXT: bl __fixhfdi +; CHECK-FP16-NEXT: vmov s0, r8 ; CHECK-FP16-NEXT: mov r4, r0 -; 
CHECK-FP16-NEXT: vmov.u16 r0, d8[0] ; CHECK-FP16-NEXT: mov r5, r1 -; CHECK-FP16-NEXT: vmov.32 d9[0], r4 -; CHECK-FP16-NEXT: vmov s0, r0 +; CHECK-FP16-NEXT: vmov.32 d13[0], r0 +; CHECK-FP16-NEXT: vmov r7, r8, d8 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: subs r2, r4, r7 -; CHECK-FP16-NEXT: vmov.32 d8[0], r0 +; CHECK-FP16-NEXT: subs r2, r4, r10 +; CHECK-FP16-NEXT: vmov.32 d12[0], r0 ; CHECK-FP16-NEXT: sbcs r2, r5, #0 ; CHECK-FP16-NEXT: mov r2, #0 -; CHECK-FP16-NEXT: vmov.32 d9[1], r5 +; CHECK-FP16-NEXT: vmov.32 d13[1], r5 ; CHECK-FP16-NEXT: movwlt r2, #1 ; CHECK-FP16-NEXT: cmp r2, #0 ; CHECK-FP16-NEXT: mvnne r2, #0 -; CHECK-FP16-NEXT: subs r0, r0, r7 +; CHECK-FP16-NEXT: subs r0, r0, r10 ; CHECK-FP16-NEXT: sbcs r0, r1, #0 -; CHECK-FP16-NEXT: vmov.32 d8[1], r1 +; CHECK-FP16-NEXT: vdup.32 d17, r2 ; CHECK-FP16-NEXT: mov r0, #0 +; CHECK-FP16-NEXT: vmov.32 d12[1], r1 ; CHECK-FP16-NEXT: movwlt r0, #1 ; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mvnne r0, #0 -; CHECK-FP16-NEXT: vdup.32 d17, r2 +; CHECK-FP16-NEXT: vmov r2, r3, d9 ; CHECK-FP16-NEXT: vdup.32 d16, r0 -; CHECK-FP16-NEXT: vmov r0, r1, d11 -; CHECK-FP16-NEXT: vbsl q8, q4, q6 -; CHECK-FP16-NEXT: rsbs r7, r9, #0 +; CHECK-FP16-NEXT: rsbs r7, r7, #0 +; CHECK-FP16-NEXT: vbsl q8, q6, q5 ; CHECK-FP16-NEXT: rscs r7, r8, #0 ; CHECK-FP16-NEXT: mov r7, #0 -; CHECK-FP16-NEXT: vmov r2, r3, d17 ; CHECK-FP16-NEXT: movwlt r7, #1 -; CHECK-FP16-NEXT: vmov r5, r4, d16 ; CHECK-FP16-NEXT: cmp r7, #0 +; CHECK-FP16-NEXT: vmov r0, r1, d17 ; CHECK-FP16-NEXT: mvnne r7, #0 +; CHECK-FP16-NEXT: vmov r5, r4, d16 ; CHECK-FP16-NEXT: rsbs r0, r0, #0 ; CHECK-FP16-NEXT: rscs r0, r1, #0 ; CHECK-FP16-NEXT: mov r0, #0 @@ -1108,25 +1084,25 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-FP16-NEXT: mvnne r0, #0 ; CHECK-FP16-NEXT: rsbs r1, r2, #0 ; CHECK-FP16-NEXT: rscs r1, r3, #0 -; CHECK-FP16-NEXT: vdup.32 d21, r0 +; CHECK-FP16-NEXT: vmov.32 d19[0], r0 ; CHECK-FP16-NEXT: mov r1, #0 -; CHECK-FP16-NEXT: vdup.32 d20, r7 ; 
CHECK-FP16-NEXT: movwlt r1, #1 ; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: mvnne r1, #0 -; CHECK-FP16-NEXT: rsbs r2, r5, #0 -; CHECK-FP16-NEXT: rscs r2, r4, #0 -; CHECK-FP16-NEXT: vdup.32 d19, r1 +; CHECK-FP16-NEXT: rsbs r0, r5, #0 +; CHECK-FP16-NEXT: rscs r0, r4, #0 +; CHECK-FP16-NEXT: vmov.32 d21[0], r1 ; CHECK-FP16-NEXT: movwlt r6, #1 ; CHECK-FP16-NEXT: cmp r6, #0 +; CHECK-FP16-NEXT: vmov.32 d20[0], r7 ; CHECK-FP16-NEXT: mvnne r6, #0 -; CHECK-FP16-NEXT: vand q10, q5, q10 -; CHECK-FP16-NEXT: vdup.32 d18, r6 +; CHECK-FP16-NEXT: vmov.32 d18[0], r6 +; CHECK-FP16-NEXT: vand q10, q4, q10 ; CHECK-FP16-NEXT: vand q8, q8, q9 ; CHECK-FP16-NEXT: vmovn.i64 d1, q10 ; CHECK-FP16-NEXT: vmovn.i64 d0, q8 ; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc} entry: %conv = fptosi <4 x half> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll index 0134ee48ad421..742f2a75a1aa8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll @@ -279,42 +279,40 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.u16 r0, q4[2] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: rsbs r2, r0, #0 ; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: sbcs.w r2, r6, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; 
CHECK-NEXT: sbcs.w r0, r6, r1 -; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: bfi r3, r0, #8, #8 -; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: sbcs.w r0, r6, r5 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: vmov.u16 r0, q4[0] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r4, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: sbcs.w r2, r6, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: sbcs.w r0, r6, r1 -; CHECK-NEXT: bfi r6, r2, #0, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: rsbs r1, r4, #0 +; CHECK-NEXT: sbcs.w r1, r6, r5 +; CHECK-NEXT: bfi r6, r0, #0, #8 ; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r6, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r6 @@ -1353,42 +1351,40 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.u16 r0, q4[2] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r4, #0 +; CHECK-NEXT: rsbs r2, r0, #0 ; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: sbcs.w r2, r6, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: 
mov.w r3, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: sbcs.w r0, r6, r1 -; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: csetm r0, lt -; CHECK-NEXT: bfi r3, r0, #8, #8 -; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: sbcs.w r0, r6, r5 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __fixhfdi ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: vmov.u16 r0, q4[0] ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __fixhfdi -; CHECK-NEXT: rsbs r2, r4, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: sbcs.w r2, r6, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: csetm r2, lt -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: sbcs.w r0, r6, r1 -; CHECK-NEXT: bfi r6, r2, #0, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: rsbs r1, r4, #0 +; CHECK-NEXT: sbcs.w r1, r6, r5 +; CHECK-NEXT: bfi r6, r0, #0, #8 ; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r6, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r6 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll index 5ac88c581f33c..b4a2aa7a1ed1b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -33,7 +33,12 @@ define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(ptr %base, ptr %offptr) { ; CHECK-LABEL: unscaled_v2i8_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrb r2, [r1] +; CHECK-NEXT: vmov.i32 q0, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: ldrb r1, 
[r0, r1] ; CHECK-NEXT: ldrb r0, [r0, r2] ; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll index fe5d7f29f01ff..acbe48f9e5927 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -302,37 +302,35 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov r10, s8 ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r10, s2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: adds.w r6, r10, r2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: asr.w r0, r10, #31 +; CHECK-NEXT: adds.w r6, r10, r2 +; CHECK-NEXT: eor.w r7, r10, r2 ; CHECK-NEXT: adc r3, r0, #0 -; CHECK-NEXT: eor.w r1, r10, r2 ; CHECK-NEXT: asrl r6, r3, r2 ; CHECK-NEXT: subs r0, r6, r2 -; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: vmov r6, s2 ; CHECK-NEXT: sbc lr, r3, #0 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: umull r0, r8, r0, r2 -; CHECK-NEXT: vmov.i64 q3, #0xffffffff -; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: asrs r5, r6, #31 ; CHECK-NEXT: adds r4, r6, r3 -; CHECK-NEXT: asr.w r7, r6, #31 -; CHECK-NEXT: adc r5, r7, #0 -; CHECK-NEXT: eor.w r7, r6, r3 +; CHECK-NEXT: adc r5, r5, #0 +; CHECK-NEXT: eor.w r1, r6, r3 ; CHECK-NEXT: asrl r4, r5, r3 ; CHECK-NEXT: subs r4, r4, r3 ; CHECK-NEXT: sbc r5, r5, #0 -; CHECK-NEXT: orrs.w r7, r7, r6, asr #31 +; CHECK-NEXT: orrs.w r7, r7, r10, asr #31 ; CHECK-NEXT: umull r4, r12, r4, r3 ; CHECK-NEXT: csetm r9, eq -; CHECK-NEXT: orrs.w r1, r1, r10, asr #31 +; CHECK-NEXT: orrs.w r1, r1, r6, asr #31 ; CHECK-NEXT: 
mov.w r7, #0 ; CHECK-NEXT: csetm r1, eq ; CHECK-NEXT: bfi r7, r9, #0, #8 @@ -345,49 +343,47 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK-NEXT: rsb.w r1, r10, #0 ; CHECK-NEXT: lsll r4, r5, r3 ; CHECK-NEXT: lsll r0, r7, r1 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: lsll r0, r7, r2 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov q3[3], q3[1], r5, r7 ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: adds r4, r0, r1 -; CHECK-NEXT: asr.w r2, r0, #31 -; CHECK-NEXT: adc r3, r2, #0 -; CHECK-NEXT: asrl r4, r3, r1 -; CHECK-NEXT: subs r2, r4, r1 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: sbc r8, r3, #0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r2, lr, r2, r1 -; CHECK-NEXT: adds r6, r4, r3 -; CHECK-NEXT: asr.w r5, r4, #31 +; CHECK-NEXT: adds r2, r3, r1 +; CHECK-NEXT: asr.w r0, r3, #31 +; CHECK-NEXT: adc r5, r0, #0 +; CHECK-NEXT: asrl r2, r5, r1 +; CHECK-NEXT: subs r0, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sbc r8, r5, #0 +; CHECK-NEXT: umull r4, lr, r0, r1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: asrs r5, r2, #31 +; CHECK-NEXT: adds r6, r2, r0 ; CHECK-NEXT: adc r7, r5, #0 ; CHECK-NEXT: mla r5, r8, r1, lr -; CHECK-NEXT: asrl r6, r7, r3 -; CHECK-NEXT: subs.w r8, r6, r3 -; CHECK-NEXT: eor.w r6, r4, r3 +; CHECK-NEXT: asrl r6, r7, r0 +; CHECK-NEXT: subs.w r8, r6, r0 +; CHECK-NEXT: eor.w r6, r2, r0 ; CHECK-NEXT: sbc lr, r7, #0 -; CHECK-NEXT: eor.w r7, r0, r1 -; CHECK-NEXT: orrs.w r6, r6, r4, asr #31 -; CHECK-NEXT: orr.w r7, r7, r0, asr #31 +; CHECK-NEXT: eor.w r7, r3, r1 +; CHECK-NEXT: orrs.w r6, r6, r2, asr #31 +; CHECK-NEXT: orr.w r7, r7, r3, asr #31 ; CHECK-NEXT: csetm r6, eq ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: bfi r12, r6, #0, #8 ; CHECK-NEXT: csetm r6, eq ; CHECK-NEXT: bfi r12, r6, #8, #8 -; CHECK-NEXT: umull r6, r7, r8, r3 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: lsll r2, r5, r0 -; 
CHECK-NEXT: rsbs r0, r4, #0 -; CHECK-NEXT: mla r7, lr, r3, r7 -; CHECK-NEXT: lsll r2, r5, r1 +; CHECK-NEXT: umull r6, r7, r8, r0 +; CHECK-NEXT: rsb.w r8, r3, #0 +; CHECK-NEXT: lsll r4, r5, r8 ; CHECK-NEXT: vmsr p0, r12 -; CHECK-NEXT: lsll r6, r7, r0 -; CHECK-NEXT: lsll r6, r7, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 +; CHECK-NEXT: mla r3, lr, r0, r7 +; CHECK-NEXT: lsll r4, r5, r1 +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: lsll r6, r3, r1 +; CHECK-NEXT: lsll r6, r3, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s8 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll index b8d9670710a00..0bec2b100911c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -420,7 +420,6 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2f64(<2 x double> %src) { ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: csetm r0, eq ; CHECK-MVE-NEXT: vmov q0[2], q0[0], r0, r6 -; CHECK-MVE-NEXT: vmov q0[3], q0[1], r0, r6 ; CHECK-MVE-NEXT: vand q0, q0, q4 ; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 5972a9a7cf934..29b56639bd769 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -11,63 +11,59 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB0_8 ; CHECK-NEXT: @ %bb.1: @ %entry -; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: bne .LBB0_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r11, r1 -; CHECK-NEXT: mov r10, r5 +; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r10, r11 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: 
@ %vector.ph ; CHECK-NEXT: bic r2, r3, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: subs r7, r2, #2 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: adr r4, .LCPI0_1 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r10, r11, r2, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r10, r5, r2, lsl #2 -; CHECK-NEXT: add.w r11, r1, r2, lsl #2 +; CHECK-NEXT: add.w r8, r1, r2, lsl #2 ; CHECK-NEXT: add.w r12, r0, r2, lsl #2 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r2, [r0], #8 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: ldrd r7, r8, [r1], #8 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: ldrd r7, r6, [r1], #8 ; CHECK-NEXT: smull r4, r7, r7, r4 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 -; CHECK-NEXT: sbcs r3, r7 -; CHECK-NEXT: mov r9, r5 +; CHECK-NEXT: mov.w r9, #-1 +; CHECK-NEXT: sbcs.w r3, r9, r7 ; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: smull r2, r3, r8, r2 +; CHECK-NEXT: smull r2, r3, r6, r2 ; CHECK-NEXT: asrl r2, r3, #31 ; CHECK-NEXT: rsbs.w r6, r2, #-2147483648 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: mov.w r6, #-1 +; CHECK-NEXT: sbcs.w r6, r9, r3 ; CHECK-NEXT: vmov q2[3], q2[1], r7, r3 -; CHECK-NEXT: sbcs r6, r3 ; CHECK-NEXT: csetm r6, lt ; CHECK-NEXT: bfi r5, r6, #8, #8 -; CHECK-NEXT: mvn r6, #-2147483648 ; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: mvn r5, #-2147483648 ; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: subs r2, r2, r6 +; CHECK-NEXT: subs r2, r2, r5 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, r4, d5 -; CHECK-NEXT: subs r2, r2, r6 +; 
CHECK-NEXT: subs r2, r2, r5 ; CHECK-NEXT: sbcs r2, r4, #0 ; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r3, r2, #8, #8 @@ -75,8 +71,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vpsel q2, q2, q1 ; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: strd r3, r2, [r5] -; CHECK-NEXT: add.w r5, r9, #8 +; CHECK-NEXT: strd r3, r2, [r11], #8 ; CHECK-NEXT: le lr, .LBB0_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldrd r2, r3, [sp] @ 8-byte Folded Reload @@ -90,7 +85,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r2, [r12], #4 -; CHECK-NEXT: ldr r4, [r11], #4 +; CHECK-NEXT: ldr r4, [r8], #4 ; CHECK-NEXT: smull r2, r5, r4, r2 ; CHECK-NEXT: asrl r2, r5, #31 ; CHECK-NEXT: subs r4, r1, r2 @@ -112,11 +107,6 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .long 4294967295 @ 0xffffffff ; CHECK-NEXT: .long 2147483648 @ 0x80000000 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-NEXT: .LCPI0_1: -; CHECK-NEXT: .long 2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 entry: switch i32 %N, label %vector.ph [ i32 0, label %for.cond.cleanup @@ -613,57 +603,56 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: @ %bb.1: @ %entry +; CHECK-NEXT: mov r8, r2 ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: bne .LBB3_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r10, r1 -; 
CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: @ %vector.ph -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bic r3, r3, #1 -; CHECK-NEXT: subs r7, r3, #2 +; CHECK-NEXT: bic r5, r3, #1 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r11, r2, r3, lsl #2 +; CHECK-NEXT: subs r7, r5, #2 +; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r2, r8, r5, lsl #2 +; CHECK-NEXT: add.w r11, r1, r5, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 -; CHECK-NEXT: add.w r10, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: add.w r12, r0, r5, lsl #2 +; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r6, [r0], #8 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: ldrd r7, r3, [r1], #8 -; CHECK-NEXT: umull r4, r9, r7, r4 -; CHECK-NEXT: lsrl r4, r9, #31 -; CHECK-NEXT: subs.w r5, r4, #-1 -; CHECK-NEXT: sbcs r5, r9, #0 +; CHECK-NEXT: ldrd r4, r9, [r0], #8 +; CHECK-NEXT: ldrd r5, r10, [r1], #8 +; CHECK-NEXT: umull r4, r5, r5, r4 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: subs.w r6, r4, #-1 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo -; CHECK-NEXT: bfi r8, r5, #0, #8 -; CHECK-NEXT: umull r6, r5, r3, r6 -; CHECK-NEXT: lsrl r6, r5, #31 -; CHECK-NEXT: subs.w r7, r6, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r6 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r9, r5 -; CHECK-NEXT: csetm r3, lo -; CHECK-NEXT: bfi r8, r3, #8, #8 -; CHECK-NEXT: vmsr p0, r8 +; CHECK-NEXT: bfi r6, r5, #0, #8 +; CHECK-NEXT: umull r10, r5, r10, r9 +; CHECK-NEXT: lsrl r10, r5, #31 +; CHECK-NEXT: subs.w r7, r10, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r10 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: csetm r5, lo +; CHECK-NEXT: bfi r6, r5, #8, #8 +; CHECK-NEXT: vmsr p0, 
r6 ; CHECK-NEXT: vpsel q1, q1, q0 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: strd r4, r3, [r2], #8 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: strd r5, r4, [r8], #8 ; CHECK-NEXT: le lr, .LBB3_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload ; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: .LBB3_6: @ %for.body.preheader @@ -671,17 +660,17 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .LBB3_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r1, [r10], #4 +; CHECK-NEXT: ldr r1, [r11], #4 ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: lsrl r0, r1, #31 -; CHECK-NEXT: subs.w r2, r0, #-1 +; CHECK-NEXT: subs.w r3, r0, #-1 ; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, #-1 -; CHECK-NEXT: str r0, [r11], #4 +; CHECK-NEXT: str r0, [r2], #4 ; CHECK-NEXT: le lr, .LBB3_7 ; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: switch i32 %N, label %vector.ph [ @@ -761,78 +750,69 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB4_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB4_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: @ %vector.ph -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill -; 
CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r7, r3, #4 +; CHECK-NEXT: bic r8, r3, #3 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r10, r1, r3, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: add.w r7, r2, r3, lsl #2 -; CHECK-NEXT: str r7, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: sub.w r7, r8, #4 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: add.w r11, r2, r8, lsl #2 +; CHECK-NEXT: add.w r9, r1, r8, lsl #2 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w r12, r0, r8, lsl #2 ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmullb.u32 q5, q4, q3 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r4, r9, d10 -; CHECK-NEXT: lsrl r4, r9, #31 +; CHECK-NEXT: vmov r10, r5, d10 +; CHECK-NEXT: lsrl r10, r5, #31 ; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: subs.w r5, r4, #-1 -; CHECK-NEXT: sbcs r5, r9, #0 -; CHECK-NEXT: vmullb.u32 q4, q2, q1 +; CHECK-NEXT: subs.w r6, r10, #-1 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo +; CHECK-NEXT: vmullb.u32 q4, q2, q1 ; CHECK-NEXT: bfi r6, r5, #0, #8 -; CHECK-NEXT: vmov r8, r5, d11 -; CHECK-NEXT: lsrl r8, r5, #31 -; CHECK-NEXT: subs.w r11, r8, #-1 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r8 -; CHECK-NEXT: sbcs r7, r5, #0 -; CHECK-NEXT: vmov q3[3], q3[1], r9, r5 -; CHECK-NEXT: csetm r7, lo -; CHECK-NEXT: bfi r6, r7, #8, #8 -; CHECK-NEXT: vmov r4, r7, d8 -; CHECK-NEXT: lsrl r4, r7, #31 +; CHECK-NEXT: vmov r4, r5, d11 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: subs.w r7, r4, #-1 +; CHECK-NEXT: vmov q3[2], q3[0], r10, r4 +; CHECK-NEXT: sbcs r5, r5, #0 +; 
CHECK-NEXT: csetm r5, lo +; CHECK-NEXT: bfi r6, r5, #8, #8 +; CHECK-NEXT: vmov r10, r5, d8 +; CHECK-NEXT: lsrl r10, r5, #31 ; CHECK-NEXT: vmsr p0, r6 -; CHECK-NEXT: subs.w r5, r4, #-1 -; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: sbcs r5, r7, #0 +; CHECK-NEXT: subs.w r6, r10, #-1 ; CHECK-NEXT: vpsel q3, q3, q0 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r6, r5, #0, #8 -; CHECK-NEXT: vmov r2, r5, d9 -; CHECK-NEXT: lsrl r2, r5, #31 -; CHECK-NEXT: subs.w r3, r2, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r7, r5 -; CHECK-NEXT: csetm r3, lo -; CHECK-NEXT: bfi r6, r3, #8, #8 +; CHECK-NEXT: vmov r4, r5, d9 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: subs.w r7, r4, #-1 +; CHECK-NEXT: vmov q1[2], q1[0], r10, r4 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: csetm r5, lo +; CHECK-NEXT: bfi r6, r5, #8, #8 ; CHECK-NEXT: vmsr p0, r6 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vpsel q1, q1, q0 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s6, s12 @@ -840,26 +820,23 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB4_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r7, r3, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: cmp r8, r3 ; CHECK-NEXT: beq .LBB4_8 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r7 +; CHECK-NEXT: sub.w lr, r3, r8 ; CHECK-NEXT: .LBB4_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r2, [r10], #4 -; CHECK-NEXT: umull r0, r3, r2, r0 -; CHECK-NEXT: lsrl r0, r3, #31 +; CHECK-NEXT: ldr r1, [r9], #4 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: lsrl r0, r1, #31 ; CHECK-NEXT: subs.w r2, r0, #-1 -; CHECK-NEXT: sbcs r2, r3, #0 +; CHECK-NEXT: 
sbcs r1, r1, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, #-1 -; CHECK-NEXT: str r0, [r1], #4 +; CHECK-NEXT: str r0, [r11], #4 ; CHECK-NEXT: le lr, .LBB4_7 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll index d49973a674a21..87df13787c6c8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -35,10 +35,15 @@ entry: define arm_aapcs_vfpcc void @unscaled_v2i8_i8(ptr %base, ptr %offptr, <2 x i8> %input) { ; CHECK-LABEL: unscaled_v2i8_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: ldrb r2, [r1] +; CHECK-NEXT: vmov.i32 q1, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] -; CHECK-NEXT: strb r3, [r0, r2] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: strb r2, [r0, r1] +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: strb r2, [r0, r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll index d2d3912fec65c..f9948db66b3b3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -443,7 +443,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q2, #0xffff +; CHECK-NEXT: vmov.i32 q2, #0xffff ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s4 @@ -1363,7 +1363,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %b) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; 
CHECK-NEXT: vmov.i64 q2, #0xff +; CHECK-NEXT: vmov.i32 q2, #0xff ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s4 @@ -1870,7 +1870,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i64 q2, #0xffff +; CHECK-NEXT: vmov.i32 q2, #0xffff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s4 @@ -2544,7 +2544,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i64 q2, #0xff +; CHECK-NEXT: vmov.i32 q2, #0xff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index 76a15f4459afe..63b1431ac0fa4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -393,7 +393,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q3, #0xffff +; CHECK-NEXT: vmov.i32 q3, #0xffff ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmov r2, s4 @@ -1587,7 +1587,7 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q3, #0xff +; CHECK-NEXT: vmov.i32 q3, #0xff ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmov r2, s4 @@ -2020,7 +2020,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i64 q3, #0xffff +; 
CHECK-NEXT: vmov.i32 q3, #0xffff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmov r2, s8 @@ -2915,7 +2915,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i64 q3, #0xff +; CHECK-NEXT: vmov.i32 q3, #0xff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmov r2, s8 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index c83514dbe7de2..c1ef500d9d3de 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -5638,10 +5638,7 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mask_mul_epu32_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] -; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1] +; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmb: @@ -5660,12 +5657,9 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> ; X86-LABEL: test_mask_mul_epu32_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10] -; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: 
{{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -5687,12 +5681,9 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) ; X86-LABEL: test_mask_mul_epu32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] -; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmbkz: @@ -7386,10 +7377,7 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mul_epu32_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), 
%zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] -; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1] +; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mul_epu32_rmb: @@ -7408,12 +7396,9 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass ; X86-LABEL: test_mul_epu32_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10] -; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -7437,12 +7422,9 @@ define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) { ; X86-LABEL: test_mul_epu32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] -; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, 
%zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mul_epu32_rmbkz: diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 4ef485b916fe4..e8e22bae23c92 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -11548,11 +11548,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mask_mul_epu32_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; X86-NEXT: vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a] -; X86-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1] +; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmb_128: @@ -11571,13 +11567,9 @@ define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x ; X86-LABEL: 
test_mask_mul_epu32_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x10] -; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; X86-NEXT: vpblendd $10, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x02,0xd3,0x0a] -; X86-NEXT: # xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x08] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -11599,13 +11591,9 @@ define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, ptr %ptr_b, i8 % ; X86-LABEL: test_mask_mul_epu32_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; X86-NEXT: vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a] -; X86-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: 
kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmbkz_128: @@ -11728,11 +11716,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, ptr %ptr_b) { ; X86-LABEL: test_mask_mul_epu32_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08] -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; X86-NEXT: vpblendd $170, %ymm2, %ymm1, %ymm1 # encoding: [0xc4,0xe3,0x75,0x02,0xca,0xaa] -; X86-NEXT: # ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1] +; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmb_256: @@ -11751,13 +11735,9 @@ define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x ; X86-LABEL: test_mask_mul_epu32_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x10] -; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; X86-NEXT: vpblendd $170, %ymm3, %ymm2, %ymm2 # encoding: [0xc4,0xe3,0x6d,0x02,0xd3,0xaa] -; X86-NEXT: # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xca] +; X86-NEXT: movzbl 
{{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x08] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -11779,13 +11759,9 @@ define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, ptr %ptr_b, i8 % ; X86-LABEL: test_mask_mul_epu32_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08] -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; X86-NEXT: vpblendd $170, %ymm2, %ymm1, %ymm1 # encoding: [0xc4,0xe3,0x75,0x02,0xca,0xaa] -; X86-NEXT: # ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mul_epu32_rmbkz_256: diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index eff6bcfe570a1..1a2aac657d30f 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -542,14 +542,16 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { define <4 x i1> 
@vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $1, %eax +; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -565,14 +567,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq @@ -618,14 
+622,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $1, %eax +; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -641,14 +647,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE2-NEXT: pand %xmm2, 
%xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll index 295a2eab029ec..473fecc307ed4 100644 --- a/llvm/test/CodeGen/X86/known-signbits-shl.ll +++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -70,8 +70,7 @@ define void @computeNumSignBits_shl_zext_vec_1(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: psubb %xmm1, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2048,8192,u,u,u,u,u,u] ; X64-NEXT: movd %xmm0, (%rdi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 5368934fa5bf1..45b61155fe626 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -192,10 +192,8 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind { ; X86-LABEL: signbits_ashr_shl_extract_sitofp: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpsrad $31, %xmm0, %xmm1 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-NEXT: vpsrad $29, %xmm0, %xmm0 -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X86-NEXT: vpsllq $20, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) @@ -203,25 +201,13 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind { ; X86-NEXT: popl %eax ; X86-NEXT: retl ; -; X64-AVX1-LABEL: signbits_ashr_shl_extract_sitofp: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; X64-AVX1-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-AVX1-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; X64-AVX1-NEXT: vpsllq $20, %xmm0, %xmm0 -; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: signbits_ashr_shl_extract_sitofp: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-AVX2-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; X64-AVX2-NEXT: vpsllq $20, %xmm0, %xmm0 -; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-LABEL: signbits_ashr_shl_extract_sitofp: +; X64: # %bb.0: +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: vpsrad $29, %xmm0, %xmm0 +; X64-NEXT: vpsllq $20, %xmm0, %xmm0 +; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = shl <2 x i64> %1, %3 = extractelement <2 x i64> %2, i32 0 @@ -473,10 +459,8 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x ; ; X64-AVX2-LABEL: signbits_ashr_sext_select_shuffle_sitofp: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrad $31, %ymm2, %ymm4 ; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] ; X64-AVX2-NEXT: vpsrad $1, %ymm2, %ymm2 -; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] ; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll index d5ceff5709974..9298a6becf6d3 100644 --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ 
b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -152,7 +152,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2147483648,1] +; SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = [1,2147483648] ; SSE4-NEXT: pmuludq %xmm0, %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; SSE4-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll index 286015840d4c7..cf1fa5a8fc493 100644 --- a/llvm/test/CodeGen/X86/pr42727.ll +++ b/llvm/test/CodeGen/X86/pr42727.ll @@ -7,7 +7,7 @@ define void @_ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_(ptr byval ; CHECK-LABEL: _ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: vpbroadcastd (%eax), %xmm1 +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vpsllq $56, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%eax) diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll index 36783d10552a5..cec6f370af0e7 100644 --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -220,12 +220,10 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $3, {{[0-9]+}}(%esp) ; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __udivdi3 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; X86-NEXT: vmovdqu 
%xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 5f147784a74a3..6b94303902d6c 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1864,9 +1864,11 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 +; X86-SSE-NEXT: movd %ecx, %xmm1 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: psllq $32, %xmm1 +; X86-SSE-NEXT: movq %xmm1, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst3: @@ -1885,9 +1887,11 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) { ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: psllq $32, %xmm1 +; X64-SSE-NEXT: movq %xmm1, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst3: @@ -1923,9 +1927,11 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: 
psllq $32, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 +; X86-SSE-NEXT: movd %ecx, %xmm1 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: psllq $32, %xmm1 +; X86-SSE-NEXT: movq %xmm1, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst4: @@ -1945,9 +1951,11 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: psllq $32, %xmm1 +; X64-SSE-NEXT: movq %xmm1, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst4: diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index bd19fa16a994b..08d9183bd30b6 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -555,7 +555,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1098,7 +1098,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[3067833783,3067833783,1,3067833783] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,u,1,u] ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] @@ -1354,7 +1354,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -2068,12 +2068,11 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm1, %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2140,12 +2139,11 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; 
CHECK-SSE2-NEXT: pand %xmm1, %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 5fcb80549fcc8..97cc1f8a15694 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -137,12 +137,14 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE2-NEXT: movd %edx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,1463,819,u] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] ; SSE2-NEXT: movdqa %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 11808ea1def8e..dcd680169ddc5 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -493,7 +493,7 @@ define <4 
x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -935,7 +935,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [1,1,2,1] +; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1175,7 +1175,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1845,7 +1845,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1915,7 +1915,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,1] +; CHECK-SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 0567cb2ac74b6..1a1a50689c87f 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -435,12 +435,12 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd %r9d, %xmm2 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm5, %xmm5 @@ -467,26 +467,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm7, %xmm7 -; 
SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm2, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: paddd %xmm7, %xmm4 -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pmuludq %xmm7, %xmm0 +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: psubd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movq %xmm1, 16(%rcx) ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: movq %xmm1, 16(%rdi) ; SSE2-NEXT: movdqa %xmm6, (%rdi) @@ -509,12 +506,12 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd %r9d, %xmm2 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: 
movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pxor %xmm5, %xmm5 @@ -541,26 +538,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 ; SSSE3-NEXT: pxor %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm2, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm1, %xmm4 ; SSSE3-NEXT: paddd %xmm7, %xmm4 -; SSSE3-NEXT: pmuludq %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] -; SSSE3-NEXT: pmuludq %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: psubd %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSSE3-NEXT: pmuludq %xmm7, %xmm0 +; SSSE3-NEXT: pmuludq %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSSE3-NEXT: pmuludq %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: psubd %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movq %xmm1, 16(%rcx) ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; 
SSSE3-NEXT: pxor %xmm5, %xmm1 ; SSSE3-NEXT: movq %xmm1, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm6, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 2b76f02b57e3c..1df40e773246a 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -366,9 +366,9 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -376,42 +376,37 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd %esi, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: movd %r9d, %xmm1 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; 
SSE2-NEXT: pmuludq %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] -; SSE2-NEXT: pmuludq %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE2-NEXT: pmuludq %xmm8, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 
+; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE2-NEXT: movq %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movq %xmm4, 16(%rdi) -; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: movq %xmm7, 16(%rdi) +; SSE2-NEXT: movdqa %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v6i32: @@ -421,9 +416,9 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -431,42 +426,37 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd %esi, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %r9d, %xmm0 -; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-NEXT: pmuludq %xmm2, %xmm0 +; SSSE3-NEXT: movd %r9d, %xmm1 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm3 -; SSSE3-NEXT: pshufd 
{{.*#+}} xmm2 = xmm3[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 -; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] -; SSSE3-NEXT: pmuludq %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pxor %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSSE3-NEXT: pmuludq %xmm8, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} 
xmm7 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSSE3-NEXT: movq %xmm0, 16(%rcx) ; SSSE3-NEXT: movdqa %xmm3, (%rcx) -; SSSE3-NEXT: movq %xmm4, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: movq %xmm7, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm1, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v6i32: diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 57bca8f4ee3e0..fbfbc45411792 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -1393,29 +1393,29 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: mul_v2i64_neg_15_63: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psrlq $32, %xmm3 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: paddq %xmm3, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: psrlq $32, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE2-NEXT: paddq %xmm1, %xmm2 +; X86-SSE2-NEXT: psllq $32, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 ; X86-SSE2-NEXT: paddq %xmm2, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-SSE4-LABEL: mul_v2i64_neg_15_63: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] +; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 ; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; 
X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE4-NEXT: psrlq $32, %xmm3 -; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: paddq %xmm3, %xmm0 -; X86-SSE4-NEXT: psllq $32, %xmm0 +; X86-SSE4-NEXT: psrlq $32, %xmm2 +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553] +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE4-NEXT: paddq %xmm1, %xmm2 +; X86-SSE4-NEXT: psllq $32, %xmm2 +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 ; X86-SSE4-NEXT: paddq %xmm2, %xmm0 ; X86-SSE4-NEXT: retl ; @@ -1482,29 +1482,29 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: mul_v2i64_neg_17_65: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psrlq $32, %xmm3 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: paddq %xmm3, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: psrlq $32, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE2-NEXT: paddq %xmm1, %xmm2 +; X86-SSE2-NEXT: psllq $32, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 ; X86-SSE2-NEXT: paddq %xmm2, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-SSE4-LABEL: mul_v2i64_neg_17_65: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 ; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE4-NEXT: psrlq 
$32, %xmm3 -; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: paddq %xmm3, %xmm0 -; X86-SSE4-NEXT: psllq $32, %xmm0 +; X86-SSE4-NEXT: psrlq $32, %xmm2 +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551] +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE4-NEXT: paddq %xmm1, %xmm2 +; X86-SSE4-NEXT: psllq $32, %xmm2 +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 ; X86-SSE4-NEXT: paddq %xmm2, %xmm0 ; X86-SSE4-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index c3e9a2b6841ae..54dc107fd0c10 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -3176,10 +3176,11 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> % ; ; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: From 99dde85e2e24633d3f6ad37e65ff3872bcc708df Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Thu, 22 May 2025 13:11:26 +0200 Subject: [PATCH 3/3] Add DoNotPoisonEltMask to SimplifyDemandedVectorEltsForTargetNode I don't know much about those X86 instructions. 
Tried to handle DoNotPoisonEltMask for several X86ISD nodes, but there are a number of situations where I just treated the DoNotPoisonEltMask as being included in DemandedElts. Taking X86ISD::VBROADCAST as an example, it is a bit tricky. That one can use widenSubVector to create a vector with UNDEF elements. But that helper is using getNode(ISD::INSERT_SUBVECTOR), which may actually return a vector with POISON elements. So then we also need to consider the "do not poison" elements as being demanded. If we fix https://github.com/llvm/llvm-project/issues/141034 then maybe we can trust widenSubVector to not return a more poisonous vector, and simplify based on just DemandedElts. --- llvm/include/llvm/CodeGen/TargetLowering.h | 3 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 7 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 116 ++++++++++++------ llvm/lib/Target/X86/X86ISelLowering.h | 1 + .../X86/avx512-intrinsics-fast-isel.ll | 4 +- llvm/test/CodeGen/X86/combine-sdiv.ll | 23 ++-- llvm/test/CodeGen/X86/combine-udiv.ll | 5 +- .../CodeGen/X86/f16c-intrinsics-fast-isel.ll | 4 - .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 87 ++++++------- llvm/test/CodeGen/X86/pr41619.ll | 2 - llvm/test/CodeGen/X86/shrink_vmul.ll | 2 - llvm/test/CodeGen/X86/test-shrink-bug.ll | 2 +- llvm/test/CodeGen/X86/vector-fshl-128.ll | 42 ++----- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 41 ++----- llvm/test/CodeGen/X86/vector-fshr-128.ll | 2 - llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 6 +- llvm/test/CodeGen/X86/vector-reduce-mul.ll | 96 ++++++--------- llvm/test/CodeGen/X86/vector-rotate-128.ll | 41 ++----- .../X86/vector-shuffle-combining-avx.ll | 12 +- .../X86/vector-shuffle-combining-ssse3.ll | 9 +- llvm/test/CodeGen/X86/vselect.ll | 26 ++-- 21 files changed, 252 insertions(+), 279 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 719cb472b785c..ab241ce1800ba 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++
b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4278,7 +4278,8 @@ class TargetLowering : public TargetLoweringBase { /// (used to simplify the caller). The KnownUndef/Zero elements may only be /// accurate for those bits in the DemandedMask. virtual bool SimplifyDemandedVectorEltsForTargetNode( - SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, + SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonEltMask, + APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const; /// Attempt to simplify any target nodes based on the demanded bits/elts, diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 0dbe7eba4875b..1d019f2073618 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3817,7 +3817,9 @@ bool TargetLowering::SimplifyDemandedVectorElts( return false; default: { if (Op.getOpcode() >= ISD::BUILTIN_OP_END) { - if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts | DoNotPoisonEltMask, KnownUndef, + if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, + DoNotPoisonEltMask, + KnownUndef, KnownZero, TLO, Depth)) return true; } else { @@ -3898,7 +3900,8 @@ unsigned TargetLowering::computeNumSignBitsForTargetInstr( } bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode( - SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, + SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonEltMask, + APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f04603867a587..21f8d5c598037 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43294,7 +43294,9 @@ bool 
X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle( } bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( - SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, + SDValue Op, const APInt &DemandedElts, + const APInt &DoNotPoisonEltMask, + APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { int NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); @@ -43308,10 +43310,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, + if (SimplifyDemandedVectorElts(LHS, DemandedElts, DoNotPoisonEltMask, + LHSUndef, LHSZero, TLO, Depth + 1)) return true; - if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, + if (SimplifyDemandedVectorElts(RHS, DemandedElts, DoNotPoisonEltMask, + RHSUndef, RHSZero, TLO, Depth + 1)) return true; // Multiply by zero. @@ -43325,11 +43329,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts); + APInt DoNotPoisonSrcElts = APIntOps::ScaleBitMask(DoNotPoisonEltMask, + 2 * NumElts); - if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO, + if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, DoNotPoisonSrcElts, + LHSUndef, LHSZero, TLO, Depth + 1)) return true; - if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO, + if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, DoNotPoisonSrcElts, + RHSUndef, RHSZero, TLO, Depth + 1)) return true; @@ -43337,11 +43345,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent. 
APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero; - if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO, + if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, DoNotPoisonSrcElts, + LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero; - if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO, + if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, DoNotPoisonSrcElts, + RHSUndef, RHSZero, TLO, Depth + 1)) return true; break; @@ -43357,7 +43367,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Aggressively peek through ops to get at the demanded elts. if (!DemandedElts.isAllOnes()) { unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); - APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); + APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts | DoNotPoisonEltMask, NumSrcElts); SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts( LHS, DemandedSrcElts, TLO.DAG, Depth + 1); SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts( @@ -43401,7 +43411,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); APInt SrcUndef; - if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO, + if (SimplifyDemandedVectorElts(Src, DemandedElts, DoNotPoisonEltMask, + SrcUndef, KnownZero, TLO, Depth + 1)) return true; @@ -43413,7 +43424,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Aggressively peek through ops to get at the demanded elts. 
if (!DemandedElts.isAllOnes()) if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( - Src, DemandedElts, TLO.DAG, Depth + 1)) + Src, DemandedElts | DoNotPoisonEltMask, TLO.DAG, Depth + 1)) return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1))); break; @@ -43427,7 +43438,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, + if (SimplifyDemandedVectorElts(LHS, DemandedElts, DoNotPoisonEltMask, + LHSUndef, LHSZero, TLO, Depth + 1)) return true; @@ -43436,7 +43448,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return TLO.CombineTo( Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); - if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, + if (SimplifyDemandedVectorElts(RHS, DemandedElts, DoNotPoisonEltMask, + RHSUndef, RHSZero, TLO, Depth + 1)) return true; @@ -43449,10 +43462,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, + if (SimplifyDemandedVectorElts(LHS, DemandedElts, DoNotPoisonEltMask, LHSUndef, LHSZero, TLO, Depth + 1)) return true; - if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, + if (SimplifyDemandedVectorElts(RHS, DemandedElts, DoNotPoisonEltMask, RHSUndef, RHSZero, TLO, Depth + 1)) return true; break; @@ -43487,7 +43500,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } APInt DemandedSrc = DemandedElts.lshr(ShiftAmt); - if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + APInt DoNotPoisonSrcElts = DoNotPoisonEltMask.lshr(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, DoNotPoisonSrcElts, + KnownUndef, 
KnownZero, TLO, Depth + 1)) return true; @@ -43526,7 +43541,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } APInt DemandedSrc = DemandedElts.shl(ShiftAmt); - if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + APInt DoNotPoisonSrcElts = DoNotPoisonEltMask.shl(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, DoNotPoisonSrcElts, + KnownUndef, KnownZero, TLO, Depth + 1)) return true; @@ -43540,19 +43557,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + // For now, treat "DoNotPoison" elements as demanded. + // FIXME: Handle DoNotPoisonEltMask better. + APInt DemandedEltsInclDoNotPoison = DemandedElts | DoNotPoisonEltMask; + auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { APInt UndefElts; SmallVector EltBits; int NumElts = VT.getVectorNumElements(); int EltSizeInBits = VT.getScalarSizeInBits(); APInt OpBits = APInt::getAllOnes(EltSizeInBits); - APInt OpElts = DemandedElts; + APInt OpElts = DemandedEltsInclDoNotPoison; if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) { OpBits.clearAllBits(); OpElts.clearAllBits(); for (int I = 0; I != NumElts; ++I) { - if (!DemandedElts[I]) + if (!DemandedEltsInclDoNotPoison[I]) continue; if (UndefElts[I]) { // We can't assume an undef src element gives an undef dst - the @@ -43604,7 +43625,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( EVT SrcVT = Src.getValueType(); APInt SrcUndef, SrcZero; APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); - if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, + APInt DoNotPoisonSrcElts = DoNotPoisonEltMask.zextOrTrunc(SrcVT.getVectorNumElements()); + if (SimplifyDemandedVectorElts(Src, SrcElts, DoNotPoisonSrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; @@ -43614,8 +43636,10 @@ bool 
X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); + // For now, treat "DoNotPoison" elements as demanded. + // FIXME: Handle DoNotPoisonEltMask better. APInt DemandedLHS, DemandedRHS; - getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + getPackDemandedElts(VT, DemandedElts | DoNotPoisonEltMask, DemandedLHS, DemandedRHS); APInt LHSUndef, LHSZero; if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO, @@ -43651,8 +43675,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); + // For now, treat "DoNotPoison" elements as demanded. + // FIXME: Handle DoNotPoisonEltMask better. APInt DemandedLHS, DemandedRHS; - getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + getHorizDemandedElts(VT, DemandedElts | DoNotPoisonEltMask, DemandedLHS, DemandedRHS); APInt LHSUndef, LHSZero; if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO, @@ -43687,8 +43713,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); + APInt DoNotPoisonSrcElts = DoNotPoisonEltMask.zextOrTrunc(SrcVT.getVectorNumElements()); APInt SrcUndef, SrcZero; - if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO, + if (SimplifyDemandedVectorElts(Src, DemandedSrc, DoNotPoisonSrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; KnownZero = SrcZero.zextOrTrunc(NumElts); @@ -43700,24 +43727,24 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask); if (SDValue R = combineBlendOfPermutes( VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask, - DemandedElts, TLO.DAG, Subtarget, SDLoc(Op))) + DemandedElts | DoNotPoisonEltMask, TLO.DAG, 
Subtarget, SDLoc(Op))) return TLO.CombineTo(Op, R); break; } case X86ISD::BLENDV: { APInt SelUndef, SelZero; - if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef, + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, DoNotPoisonEltMask, SelUndef, SelZero, TLO, Depth + 1)) return true; // TODO: Use SelZero to adjust LHS/RHS DemandedElts. APInt LHSUndef, LHSZero; - if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef, + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, DoNotPoisonEltMask, LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt RHSUndef, RHSZero; - if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef, + if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, DoNotPoisonEltMask, RHSUndef, RHSZero, TLO, Depth + 1)) return true; @@ -43728,7 +43755,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::VZEXT_MOVL: { // If upper demanded elements are already zero then we have nothing to do. SDValue Src = Op.getOperand(0); - APInt DemandedUpperElts = DemandedElts; + APInt DemandedUpperElts = DemandedElts | DoNotPoisonEltMask; DemandedUpperElts.clearLowBits(1); if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1)) return TLO.CombineTo(Op, Src); @@ -43738,7 +43765,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // If upper demanded elements are not demanded then simplify to a // scalar_to_vector(load()). 
MVT SVT = VT.getSimpleVT().getVectorElementType(); - if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) { + if ((DemandedElts | DoNotPoisonEltMask) == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) { SDLoc DL(Op); auto *Mem = cast(Op); SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(), @@ -43749,10 +43776,14 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( break; } case X86ISD::VBROADCAST: { + // For now, treat "DoNotPoison" elements as demanded. + // FIXME: Handle DoNotPoisonEltMask better. + APInt DemandedEltsInclDoNotPoison = DemandedElts | DoNotPoisonEltMask; + SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); // Don't bother broadcasting if we just need the 0'th element. - if (DemandedElts == 1) { + if (DemandedEltsInclDoNotPoison == 1) { if (!SrcVT.isVector()) Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src); else if (Src.getValueType() != VT) @@ -43775,35 +43806,42 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( break; } case X86ISD::VPERMV: - if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO, + // FIXME: Handle DoNotPoisonEltMask better. + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts | DoNotPoisonEltMask, 0, TLO, Depth)) return true; break; case X86ISD::PSHUFB: case X86ISD::VPERMV3: case X86ISD::VPERMILPV: - if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO, + // FIXME: Handle DoNotPoisonEltMask better. + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts | DoNotPoisonEltMask, 1, TLO, Depth)) return true; break; case X86ISD::VPPERM: case X86ISD::VPERMIL2: - if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO, + // FIXME: Handle DoNotPoisonEltMask better. + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts | DoNotPoisonEltMask, 2, TLO, Depth)) return true; break; } + // For now, treat "DoNotPoison" elements as demanded below. 
+ // FIXME: Handle DoNotPoisonEltMask better. + APInt DemandedEltsInclDoNotPoison = DemandedElts | DoNotPoisonEltMask; + // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not // demand any of the high elements, then narrow the op to 128/256-bits: e.g. // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 if ((VT.is256BitVector() || VT.is512BitVector()) && - DemandedElts.lshr(NumElts / 2) == 0) { + DemandedEltsInclDoNotPoison.lshr(NumElts / 2) == 0) { unsigned SizeInBits = VT.getSizeInBits(); unsigned ExtSizeInBits = SizeInBits / 2; // See if 512-bit ops only use the bottom 128-bits. - if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) + if (VT.is512BitVector() && DemandedEltsInclDoNotPoison.lshr(NumElts / 4) == 0) ExtSizeInBits = SizeInBits / 4; switch (Opc) { @@ -44071,14 +44109,14 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // For splats, unless we *only* demand the 0'th element, // stop attempts at simplification here, we aren't going to improve things, // this is better than any potential shuffle. - if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false)) + if (!DemandedEltsInclDoNotPoison.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false)) return false; // Get target/faux shuffle mask. APInt OpUndef, OpZero; SmallVector OpMask; SmallVector OpInputs; - if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef, + if (!getTargetShuffleInputs(Op, DemandedEltsInclDoNotPoison, OpInputs, OpMask, OpUndef, OpZero, TLO.DAG, Depth, false)) return false; @@ -44096,7 +44134,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Check if shuffle mask can be simplified to undef/zero/identity. 
int NumSrcs = OpInputs.size(); for (int i = 0; i != NumElts; ++i) - if (!DemandedElts[i]) + if (!DemandedEltsInclDoNotPoison[i]) OpMask[i] = SM_SentinelUndef; if (isUndefInRange(OpMask, 0, NumElts)) { @@ -44121,7 +44159,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( int Lo = Src * NumElts; APInt SrcElts = APInt::getZero(NumElts); for (int i = 0; i != NumElts; ++i) - if (DemandedElts[i]) { + if (DemandedEltsInclDoNotPoison[i]) { int M = OpMask[i] - Lo; if (0 <= M && M < NumElts) SrcElts.setBit(M); @@ -44141,7 +44179,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // to match. This prevents combineX86ShuffleChain from returning a // combined shuffle that's the same as the original root, causing an // infinite loop. - if (!DemandedElts.isAllOnes()) { + if (!DemandedEltsInclDoNotPoison.isAllOnes()) { assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range"); SmallVector DemandedMask(NumElts, SM_SentinelUndef); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 662552a972249..db36467888882 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1302,6 +1302,7 @@ namespace llvm { bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, + const APInt &DoNotPoisonElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 5aac1554e6e3b..a8574c0b7516c 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6625,7 +6625,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpsrldq {{.*#+}} xmm2 = 
xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 @@ -6833,7 +6833,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 045979afc1f53..6c78799732a82 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2187,15 +2187,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: paddw %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] -; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; 
SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] ; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm4, %xmm4 +; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7] +; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: paddb %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -2223,15 +2222,15 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7] -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128] ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 diff --git 
a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index 6230c883cd7c5..55715197830b1 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -631,10 +631,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: psrlw $7, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll index cf56effeb348c..1886e2911ede8 100644 --- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll @@ -40,8 +40,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind { ; X86-LABEL: test_cvtss_sh: ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X86-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -49,8 +47,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind { ; ; X64-LABEL: test_cvtss_sh: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index e4df7e8d8877c..a619cb70460b7 100644 --- 
a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -141,59 +141,63 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-LABEL: fmul_pow2_8xhalf: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: subq $120, %rsp -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 128 -; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; CHECK-SSE-NEXT: pslld $23, %xmm2 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; CHECK-SSE-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE-NEXT: cvttps2dq %xmm2, %xmm2 -; CHECK-SSE-NEXT: pslld $16, %xmm2 -; CHECK-SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-NEXT: subq $104, %rsp +; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112 +; CHECK-SSE-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; CHECK-SSE-NEXT: pslld $23, %xmm1 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; CHECK-SSE-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: pslld $16, %xmm1 +; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd %xmm3, %xmm0 +; CHECK-SSE-NEXT: paddd %xmm2, %xmm0 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: pslld $16, %xmm0 ; 
CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: psrld $16, %xmm0 -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-SSE-NEXT: psrlq $48, %xmm0 ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: psrlq $48, %xmm0 +; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 +; CHECK-SSE-NEXT: callq __truncsfhf2@PLT +; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: psrld $16, %xmm0 -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; 
CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-SSE-NEXT: psrlq $48, %xmm0 ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: psrlq $48, %xmm0 +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT @@ -205,9 +209,9 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; 
CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT @@ -221,23 +225,23 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movss 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT @@ -251,12 +255,11 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE-NEXT: addq $120, %rsp +; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-SSE-NEXT: addq $104, %rsp ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll index 5d11f1c960a8c..ad778b4970cbf 100644 --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -5,8 +5,6 @@ define void @foo(double %arg) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: movl %eax, (%rax) ; CHECK-NEXT: movq $0, (%rax) diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 6b94303902d6c..dbbcc5e9c1b7f 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1925,7 +1925,6 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; 
X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 ; X86-SSE-NEXT: movd %ecx, %xmm1 @@ -1949,7 +1948,6 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-SSE-NEXT: psrad $16, %xmm0 ; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 ; X64-SSE-NEXT: movd %ecx, %xmm1 diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll index eb44e10b188ac..953a0d65c5386 100644 --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -65,7 +65,7 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) { ; ; CHECK-X64-LABEL: fail: ; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-X64-NEXT: pslld $8, %xmm0 ; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: xorb $1, %al diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index fb007c7ee8aaf..1353d6ff1593b 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1323,34 +1323,19 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % } define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: psllw %xmm2, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psllw %xmm2, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_funnnel_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: psllw %xmm2, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: psllw %xmm2, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: psllw %xmm2, %xmm3 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: psllw %xmm2, %xmm1 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: 
retq ; ; AVX-LABEL: splatvar_funnnel_v16i8: ; AVX: # %bb.0: @@ -1448,7 +1433,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 27f7204b4bdd4..edd2678c423ed 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1060,32 +1060,18 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind } define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: psllw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psllw %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_funnnel_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE41-NEXT: psllw %xmm1, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: psllw %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: psllw %xmm1, %xmm2 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psllw %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: splatvar_funnnel_v16i8: ; AVX: # %bb.0: @@ -1186,10 +1172,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: psllw %xmm1, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index f10fec2638487..173fecb3edb3b 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll 
@@ -1451,7 +1451,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 @@ -1593,7 +1592,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 15b3e9c43413c..507316752e19d 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1104,10 +1104,9 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: psrlw %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm3, %xmm2 @@ -1249,10 +1248,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: psrlw %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; X86-SSE2-NEXT: pand %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index 5e3ef32ef7e4a..a02d5591c42f7 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -19,8 +19,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -34,7 +33,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1OR2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1OR2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1OR2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; 
AVX1OR2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1OR2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -48,7 +47,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -62,7 +61,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -107,8 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -131,7 +129,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, 
%xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -155,7 +153,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -179,7 +177,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -203,7 +201,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -274,8 +272,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, 
%xmm3 @@ -315,7 +312,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -347,7 +344,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -380,7 +377,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -413,7 +410,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, 
%xmm2 @@ -527,8 +524,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -602,7 +598,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -650,7 +646,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -691,7 +687,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -732,7 +728,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; 
AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -808,10 +804,8 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm3 ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4i32: @@ -838,21 +832,15 @@ define i32 @test_v4i32(<4 x i32> %a0) { define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-LABEL: test_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm2, 
%xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE2-NEXT: pmuludq %xmm3, %xmm0 +; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; @@ -923,19 +911,13 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq @@ -1043,21 +1025,15 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v32i32: diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 1a8aa809e5db5..aa787921fface 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -933,32 +933,18 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { } define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_rotate_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: psllw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psllw %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_rotate_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE41-NEXT: psllw %xmm1, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: psllw %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatvar_rotate_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: psllw %xmm1, %xmm2 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psllw %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: splatvar_rotate_v16i8: ; AVX: # %bb.0: @@ -999,10 +985,9 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; X86-SSE2-LABEL: splatvar_rotate_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: psllw %xmm1, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index de6e79550e869..2df013d0ff3e3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -377,9 +377,9 @@ define void @PR39483() { ; X86-AVX1-NEXT: vmovups 64, %ymm1 ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7] -; X86-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4] +; X86-AVX1-NEXT: vmovups 16, %xmm2 +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] ; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -417,9 +417,9 @@ define void @PR39483() { ; X64-AVX1-NEXT: vmovups 64, %ymm1 ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; X64-AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7] -; X64-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4] +; X64-AVX1-NEXT: vmovups 16, %xmm2 +; X64-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] ; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index 02ff8a33cfbd3..12d494c32b656 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -851,20 +851,23 @@ define <16 x i8> @constant_fold_pshufb_2() { define i32 @mask_zzz3_v16i8(<16 x i8> %a0) { ; SSSE3-LABEL: mask_zzz3_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = 
zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: mask_zzz3_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14] +; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSE41-NEXT: retq ; ; AVX-LABEL: mask_zzz3_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14] +; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; AVX-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = bitcast <16 x i8> %1 to <4 x i32> diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index bfd25aa667d2d..f87abc1a079b9 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -770,14 +770,24 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) { ; SSE-NEXT: shll $15, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: vselect_any_extend_vector_inreg_crash: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: andl $1, %eax -; AVX-NEXT: shll $15, %eax -; AVX-NEXT: retq +; AVX1-LABEL: vselect_any_extend_vector_inreg_crash: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: shll $15, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: vselect_any_extend_vector_inreg_crash: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; 
AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [49,49,49,49] +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: shll $15, %eax +; AVX2-NEXT: retq 0: %1 = load <8 x i8>, ptr %x %2 = icmp eq <8 x i8> %1,